#!/usr/bin/env python3
"""
Taobao smart data extractor
Strategy: pull data directly from the rendered page via in-page JavaScript
"""

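# Usage: python <this_script>.py [MAX_KEYWORDS]
#   Reads keywords (one per line) from daily_goods_100.txt and writes JSON/CSV
#   results plus raw HTML snapshots to taobao_smart_results/.
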
import json
import time
import random
import os
from datetime import datetime
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

OUTPUT_DIR = "taobao_smart_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log(message, level="INFO"):
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]

def setup_driver():
    """Configure the Chrome WebDriver."""
    chrome_options = Options()
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument(f'user-agent={random.choice(USER_AGENTS)}')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
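    # Termux (Android) Chromium paths below; adjust binary_location and the chromedriver path for other platforms.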
    chrome_options.binary_location = '/data/data/com.termux/files/usr/bin/chromium-browser'

    chromedriver_path = '/data/data/com.termux/files/usr/bin/chromedriver'
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Hide the navigator.webdriver automation flag
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })

    return driver
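
# Optional helper (not wired into the main flow): wait for result containers to
# appear instead of relying only on fixed sleeps. A minimal sketch; the
# '[data-spm]' selector is an assumption about Taobao's current markup and may
# need adjusting.
def wait_for_results(driver, timeout=30):
    """Return True once at least one '[data-spm]' element is present, False on timeout."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-spm]'))
        )
        return True
    except Exception:
        return False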

def extract_with_javascript(driver):
    """Extract visible text, links, images, and prices via in-page JavaScript."""
    script = """
    function extractData() {
        const results = [];

        // Selectors likely to match product containers
        const selectors = [
            '[class*="item"]',
            '[class*="Item"]',
            '[class*="product"]',
            '[class*="Product"]',
            '[class*="card"]',
            '[class*="Card"]',
            '[data-spm]'
        ];

        const allElements = new Set();
        selectors.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => allElements.add(el));
        });

        // Iterate over an Array copy so the callback gets a numeric index
        // (Set.forEach passes the value itself as the second argument)
        Array.from(allElements).forEach((element, index) => {
            // Skip elements too small to be product cards
            if (element.offsetHeight < 50 || element.offsetWidth < 100) return;

            const item = {
                index: index,
                text: element.innerText?.substring(0, 500) || '',
                classes: element.className,
                html: element.outerHTML.substring(0, 1000)
            };

            // Product link
            const link = element.querySelector('a[href]');
            if (link) {
                item.url = link.href;
            }

            // Product image (src or lazy-loaded data-src)
            const img = element.querySelector('img[src], img[data-src]');
            if (img) {
                item.image = img.src || img.getAttribute('data-src') || '';
            }

            // Price element
            const priceEl = element.querySelector('[class*="price"], [class*="Price"]');
            if (priceEl) {
                item.price = priceEl.innerText;
            }

            // Keep only entries with meaningful content
            if (item.text.length > 10 || item.url) {
                results.push(item);
            }
        });

        return results;
    }

    return extractData();
    """

    try:
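        # execute_script returns the JS array as a Python list of dicts
        # (Selenium JSON-serializes the returned values)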
        data = driver.execute_script(script)
        return data
    except Exception as e:
        log(f"JavaScript execution failed: {e}", "ERROR")
        return []

def scrape_keyword_smart(driver, keyword):
    """Scrape a single keyword and extract product-like items."""
    log(f"\n{'='*60}")
    log(f"Scraping: {keyword}")
    log(f"{'='*60}")

    url = f"https://s.taobao.com/search?q={quote(keyword)}"

    try:
        driver.get(url)
        log("Waiting for the page to load...")

        # Wait generously; Taobao renders search results asynchronously
        time.sleep(random.randint(15, 20))

        # Scroll down in steps to trigger lazy-loading
        for i in range(3):
            driver.execute_script(f"window.scrollBy(0, {random.randint(400, 800)});")
            time.sleep(random.uniform(1, 2))

        # Let lazy-loaded content settle
        time.sleep(3)

        # Extract via in-page JavaScript
        log("Extracting data with JavaScript...")
        raw_data = extract_with_javascript(driver)

        log(f"Raw items: {len(raw_data)}")

        # Clean and normalize the raw items
        products = []
        for item in raw_data:
            if len(item.get('text', '')) > 20:  # drop items with very little text
                product = {
                    'keyword': keyword,
                    'text': item.get('text', '')[:200],
                    'url': item.get('url', ''),
                    'price': item.get('price', ''),
                    'image': item.get('image', ''),
                    'classes': item.get('classes', '')[:100]
                }
                products.append(product)

        log(f"✓ Extracted {len(products)} valid items", "SUCCESS")

        # Save the raw page HTML for offline inspection
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        html_file = os.path.join(OUTPUT_DIR, f"page_{keyword}_{timestamp}.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

        return products

    except Exception as e:
        log(f"✗ Error: {e}", "ERROR")
        return []

def batch_scrape_smart(keywords_file, max_keywords=10):
    """Batch-scrape a list of keywords."""
    log("="*70)
    log("Taobao smart batch scraper")
    log("="*70)

    # Read keywords, one per line
    with open(keywords_file, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()][:max_keywords]

    log(f"Scraping {len(keywords)} keywords this run\n")

    driver = setup_driver()
    all_products = []

    try:
        for idx, keyword in enumerate(keywords, 1):
            log(f"Progress: [{idx}/{len(keywords)}]")

            products = scrape_keyword_smart(driver, keyword)
            all_products.extend(products)

            # Random delay between keywords to reduce the chance of being blocked
            if idx < len(keywords):
                delay = random.randint(5, 10)
                log(f"Waiting {delay} seconds...\n")
                time.sleep(delay)

        # Persist results
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if all_products:
            # JSON
            json_file = os.path.join(OUTPUT_DIR, f"products_{timestamp}.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(all_products, f, ensure_ascii=False, indent=2)
            log(f"\n✓ JSON: {json_file}")

            # CSV
            csv_file = os.path.join(OUTPUT_DIR, f"products_{timestamp}.csv")
            df = pd.DataFrame(all_products)
            df.to_csv(csv_file, index=False, encoding='utf-8-sig')
            log(f"✓ CSV: {csv_file}")

            # Summary
            log(f"\n{'='*70}")
            log(f"Total: {len(all_products)} items")
            log(f"Average: {len(all_products)/len(keywords):.1f} items per keyword")

            # Sample of the extracted items
            log("\nSample items:")
            for i, p in enumerate(all_products[:5], 1):
                log(f"\n[{i}] {p.get('keyword')}")
                log(f"    {p.get('text', '')[:80]}...")
                log(f"    Price: {p.get('price', 'N/A')}")

        else:
            log("\n⚠ No data extracted", "WARNING")

    finally:
        driver.quit()

if __name__ == "__main__":
    import sys
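    # Optional CLI argument: number of keywords from the file to scrape (default 5)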
    max_kw = 5 if len(sys.argv) < 2 else int(sys.argv[1])
    batch_scrape_smart('daily_goods_100.txt', max_keywords=max_kw)
