#!/usr/bin/env python3
"""
淘宝批量爬虫 - 反爬虫绕过版本
策略：
1. 随机User-Agent
2. 随机延迟
3. 使用undetected-chromedriver
4. 隐藏自动化特征
5. 模拟人类行为
"""

import json
import time
import random
import os
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Output directory
OUTPUT_DIR = "taobao_batch_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Logging helper
def log(message, level="INFO"):
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

# Pool of random User-Agent strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
]

def setup_driver_stealth():
    """配置隐身模式的Chrome驱动"""
    log("配置反检测Chrome驱动...")

    chrome_options = Options()

    # Base options
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')

    # Window size (mimic a real browser)
    chrome_options.add_argument('--window-size=1920,1080')

    # Random User-Agent
    user_agent = random.choice(USER_AGENTS)
    chrome_options.add_argument(f'user-agent={user_agent}')

    # Disable image loading (faster page loads)
    prefs = {
        'profile.managed_default_content_settings.images': 2,
        'profile.default_content_setting_values': {
            'notifications': 2
        }
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # Strip automation flags
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Termux binary paths
    chrome_options.binary_location = '/data/data/com.termux/files/usr/bin/chromium-browser'

    try:
        chromedriver_path = '/data/data/com.termux/files/usr/bin/chromedriver'
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Mask the navigator.webdriver flag
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });

                // Stub out the window.chrome object
                window.chrome = {
                    runtime: {}
                };

                // Patch the permissions API (bind so the native query keeps its receiver)
                const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
                window.navigator.permissions.query = (parameters) => (
                    parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters)
                );
            '''
        })

        log("✓ Chrome驱动初始化成功", "SUCCESS")
        return driver
    except Exception as e:
        log(f"✗ 驱动初始化失败: {e}", "ERROR")
        return None
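
# The module docstring mentions undetected-chromedriver, while setup_driver_stealth()
# above uses plain Selenium. The function below is a minimal, optional sketch of the
# same setup via the undetected-chromedriver package; it assumes the package is
# installed and that it can drive the same Termux chromium/chromedriver binaries,
# and is not called by the main flow.
def setup_driver_uc():
    """Sketch: stealth driver via undetected-chromedriver (optional alternative)."""
    try:
        import undetected_chromedriver as uc  # pip install undetected-chromedriver
        options = uc.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument(f'user-agent={random.choice(USER_AGENTS)}')
        driver = uc.Chrome(
            options=options,
            headless=True,
            browser_executable_path='/data/data/com.termux/files/usr/bin/chromium-browser',
            driver_executable_path='/data/data/com.termux/files/usr/bin/chromedriver',
        )
        log("✓ undetected-chromedriver initialized", "SUCCESS")
        return driver
    except Exception as e:
        log(f"✗ undetected-chromedriver unavailable: {e}", "ERROR")
        return None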

def random_delay(min_sec=2, max_sec=5):
    """随机延迟，模拟人类行为"""
    delay = random.uniform(min_sec, max_sec)
    time.sleep(delay)

def human_like_scroll(driver):
    """模拟人类滚动行为"""
    # 随机滚动次数
    scroll_count = random.randint(2, 4)

    for i in range(scroll_count):
        # Random scroll distance
        scroll_distance = random.randint(300, 800)
        driver.execute_script(f"window.scrollBy(0, {scroll_distance});")

        # Random pause between scrolls
        time.sleep(random.uniform(0.5, 1.5))

    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(1, 2))
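
# Scrolling is only one behavioral signal. Below is a small, hedged sketch of random
# mouse movement using Selenium's ActionChains; the offsets are kept small so the
# pointer is likely to stay inside the viewport, and the helper is optional (it is
# not called by the scraping flow above).
def human_like_mouse_move(driver):
    """Sketch: nudge the mouse around to look less robotic."""
    from selenium.webdriver.common.action_chains import ActionChains
    try:
        actions = ActionChains(driver)
        for _ in range(random.randint(2, 4)):
            actions.move_by_offset(random.randint(-40, 40), random.randint(-30, 30))
            actions.pause(random.uniform(0.2, 0.6))
        actions.perform()
    except Exception:
        # Moves outside the viewport can raise; this is cosmetic, so ignore failures.
        pass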

def extract_products_from_page(html_content, keyword):
    """从页面HTML中提取商品数据"""
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    # Strategy 1: find every element that could be a product card
    # Class-name patterns Taobao is likely to use
    patterns = [
        {'class': re.compile(r'.*item.*', re.I)},
        {'class': re.compile(r'.*product.*', re.I)},
        {'class': re.compile(r'.*goods.*', re.I)},
        {'class': re.compile(r'.*card.*', re.I)},
    ]

    all_items = []
    for pattern in patterns:
        items = soup.find_all('div', pattern)
        if items:
            all_items.extend(items)

    # De-duplicate
    seen = set()
    unique_items = []
    for item in all_items:
        item_str = str(item)[:200]
        if item_str not in seen:
            seen.add(item_str)
            unique_items.append(item)

    log(f"  找到 {len(unique_items)} 个可能的商品元素")

    # Extract fields
    for idx, item in enumerate(unique_items[:50]):  # cap at the first 50
        try:
            product = {
                'keyword': keyword,
                'title': '',
                'price': '',
                'sales': '',
                'shop': '',
                'url': '',
                'index': idx + 1
            }

            # Title
            title_elem = item.find(['h3', 'h4', 'a', 'div'], class_=re.compile(r'.*(title|name).*', re.I))
            if title_elem:
                product['title'] = title_elem.get_text(strip=True)

            # Price
            price_elem = item.find(['span', 'div', 'em'], class_=re.compile(r'.*(price|yuan).*', re.I))
            if price_elem:
                product['price'] = price_elem.get_text(strip=True)

            # Sales volume
            sales_elem = item.find(['span', 'div'], class_=re.compile(r'.*(sale|sold).*', re.I))
            if sales_elem:
                product['sales'] = sales_elem.get_text(strip=True)

            # Shop name
            shop_elem = item.find(['span', 'div', 'a'], class_=re.compile(r'.*(shop|store).*', re.I))
            if shop_elem:
                product['shop'] = shop_elem.get_text(strip=True)

            # Product URL
            link = item.find('a', href=True)
            if link:
                product['url'] = link['href']

            # Keep only entries that have at least a title or a price
            if product['title'] or product['price']:
                products.append(product)

        except Exception:
            continue

    return products
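
# WebDriverWait and expected_conditions are imported at the top but never used.
# Below is a hedged sketch of an explicit wait that scrape_keyword() could use to
# replace part of its fixed sleep; the CSS selector is an assumption rather than a
# confirmed Taobao page structure, so a timeout simply returns False and the caller
# can fall back to sleeping.
def wait_for_results(driver, timeout=15):
    """Sketch: wait until something that looks like a result list is present."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div[class*='item'], div[class*='Card']")  # assumed selector
            )
        )
        return True
    except Exception:
        return False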

def scrape_keyword(driver, keyword, max_wait=15):
    """爬取单个关键词"""
    log(f"\n{'='*60}")
    log(f"开始爬取: {keyword}")
    log(f"{'='*60}")

    # Build the search URL
    from urllib.parse import quote
    encoded_keyword = quote(keyword)
    url = f"https://s.taobao.com/search?q={encoded_keyword}"

    try:
        # Open the page
        log("Opening URL...")
        driver.get(url)

        # Wait for the page to load
        wait_time = random.randint(max_wait-3, max_wait+3)
        log(f"Waiting {wait_time} s...")
        time.sleep(wait_time)

        # Human-like scrolling
        log("Simulating scrolling...")
        human_like_scroll(driver)

        # Short extra wait
        time.sleep(random.uniform(2, 4))

        # Grab the page source
        html = driver.page_source

        # Save the HTML (for debugging)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        html_file = os.path.join(OUTPUT_DIR, f"html_{keyword}_{timestamp}.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html)
        log(f"✓ 已保存HTML: {html_file}")

        # Extract products
        log("Extracting product data...")
        products = extract_products_from_page(html, keyword)

        log(f"✓ 提取到 {len(products)} 个商品", "SUCCESS")

        return products, html_file

    except Exception as e:
        log(f"✗ 爬取失败: {e}", "ERROR")
        return [], None
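
# Taobao search can redirect to a login wall or a slider captcha instead of results.
# Below is a heuristic sketch for spotting that; the markers are assumptions based on
# commonly reported block pages, not a confirmed list, so treat a True result as
# "probably blocked" and back off rather than retrying immediately. It is not wired
# into the flow above.
def looks_blocked(driver, html):
    """Sketch: guess whether the current page is a block/login/captcha page."""
    markers = ('login.taobao.com', 'punish', '滑动验证', '安全验证')  # assumed markers
    url = driver.current_url or ''
    return any(m in url for m in markers) or any(m in html for m in markers)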

def batch_scrape(keywords_file, max_keywords=None, start_from=0):
    """批量爬取"""
    log("="*70)
    log("淘宝批量爬虫 - 反爬虫版本")
    log("="*70)

    # Read keywords
    log(f"\nReading keyword file: {keywords_file}")
    with open(keywords_file, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]

    total_keywords = len(keywords)
    log(f"✓ 读取到 {total_keywords} 个关键词")

    # Limit the batch size
    if max_keywords:
        keywords = keywords[start_from:start_from + max_keywords]
        log(f"本次爬取: {len(keywords)} 个（从第 {start_from+1} 个开始）")

    # Initialize the driver
    driver = setup_driver_stealth()
    if not driver:
        log("无法初始化驱动，退出", "ERROR")
        return

    all_products = []
    success_count = 0
    fail_count = 0

    try:
        for idx, keyword in enumerate(keywords, start_from + 1):
            log(f"\n进度: [{idx}/{start_from + len(keywords)}]")

            # Scrape this keyword
            products, html_file = scrape_keyword(driver, keyword)

            if products:
                all_products.extend(products)
                success_count += 1
            else:
                fail_count += 1

            # Random delay between keywords (avoid getting blocked)
            if idx < start_from + len(keywords):
                delay = random.randint(3, 8)
                log(f"等待 {delay} 秒后继续...")
                time.sleep(delay)

        # Save results
        log("\n" + "="*70)
        log("Saving data...")
        log("="*70)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if all_products:
            # Save JSON
            json_file = os.path.join(OUTPUT_DIR, f"products_{timestamp}.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(all_products, f, ensure_ascii=False, indent=2)
            log(f"✓ JSON: {json_file}")

            # Save CSV
            csv_file = os.path.join(OUTPUT_DIR, f"products_{timestamp}.csv")
            df = pd.DataFrame(all_products)
            df.to_csv(csv_file, index=False, encoding='utf-8-sig')
            log(f"✓ CSV: {csv_file}")

            # Build a summary report
            report = {
                'timestamp': timestamp,
                'total_keywords': len(keywords),
                'success_count': success_count,
                'fail_count': fail_count,
                'total_products': len(all_products),
                'keywords_processed': keywords,
                'avg_products_per_keyword': len(all_products) / success_count if success_count > 0 else 0
            }

            report_file = os.path.join(OUTPUT_DIR, f"report_{timestamp}.json")
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, ensure_ascii=False, indent=2)
            log(f"✓ 报告: {report_file}")

            # Print the summary
            log("\n" + "="*70)
            log("Scrape summary")
            log("="*70)
            log(f"Keywords: {len(keywords)}")
            log(f"Succeeded: {success_count}")
            log(f"Failed: {fail_count}")
            log(f"Total products: {len(all_products)}")
            log(f"Average per keyword: {len(all_products) / success_count if success_count > 0 else 0:.1f}")

            # Show a sample
            log("\nSample data (first 5 entries):")
            for i, product in enumerate(all_products[:5], 1):
                log(f"\n  [{i}] {product.get('keyword', 'N/A')}")
                log(f"      Title: {product.get('title', 'N/A')[:50]}")
                log(f"      Price: {product.get('price', 'N/A')}")
                log(f"      Sales: {product.get('sales', 'N/A')}")
        else:
            log("⚠ 未提取到任何商品数据", "WARNING")

    except KeyboardInterrupt:
        log("\n用户中断", "WARNING")
    except Exception as e:
        log(f"\n✗ 发生错误: {e}", "ERROR")
        import traceback
        log(traceback.format_exc(), "ERROR")
    finally:
        log("\n关闭浏览器...")
        driver.quit()
        log("✓ 完成")

def main():
    """主函数"""
    import sys

    # Parameters
    keywords_file = 'daily_goods_100.txt'

    # Can be overridden from the command line
    max_keywords = 10 if len(sys.argv) < 2 else int(sys.argv[1])  # default: test with 10 first
    start_from = 0 if len(sys.argv) < 3 else int(sys.argv[2])

    log(f"关键词文件: {keywords_file}")
    log(f"爬取数量: {max_keywords}")
    log(f"起始位置: {start_from}")

    batch_scrape(keywords_file, max_keywords=max_keywords, start_from=start_from)

if __name__ == "__main__":
    main()
