#!/usr/bin/env python3
"""
淘宝爬虫 - Termux优化版
使用Selenium + Chromium
"""

import json
import time
import os
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

# Create the output directory
OUTPUT_DIR = "taobao_selenium_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log(message, level="INFO"):
    """打印带时间戳的日志"""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

def setup_driver():
    """配置Chromium驱动 - Termux优化"""
    log("配置Chromium驱动...")

    chrome_options = Options()

    # Core options
    chrome_options.add_argument('--headless=new')  # new-style headless mode
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-software-rasterizer')

    # Performance tweaks
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-images')  # skip image loading
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
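    # Note: the blink-settings flag above is what actually disables image loading;
    # '--disable-images' may not be honored by every Chromium build.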

    # Window size
    chrome_options.add_argument('--window-size=1920,1080')

    # User Agent
    chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
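    # (Headless Chromium's default UA contains "HeadlessChrome", which sites can flag,
    # so a regular desktop Linux UA is presented above.)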

    # Termux-specific Chromium binary location
    chrome_options.binary_location = '/data/data/com.termux/files/usr/bin/chromium-browser'

    try:
        # Point explicitly at the Termux ChromeDriver binary
        chromedriver_path = '/data/data/com.termux/files/usr/bin/chromedriver'
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        log("✓ Chromium驱动初始化成功", "SUCCESS")
        return driver
    except Exception as e:
        log(f"✗ 驱动初始化失败: {e}", "ERROR")

        # Fall back to letting Selenium find the browser binary itself
        try:
            log("Trying fallback (no explicit binary location)...")
            # Empty string rather than None: newer Selenium versions reject non-string values
            chrome_options.binary_location = ''
            service = Service(executable_path=chromedriver_path)
            driver = webdriver.Chrome(service=service, options=chrome_options)
            log("✓ Fallback succeeded", "SUCCESS")
            return driver
        except Exception as e2:
            log(f"✗ Fallback also failed: {e2}", "ERROR")
            return None
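
# Optional refinement (a sketch, not used by default): instead of the fixed
# time.sleep() in scrape_taobao() below, wait explicitly for a result element
# to appear. The CSS selector is an assumption -- Taobao's class names change
# often, so adjust it after inspecting the saved HTML.
def wait_for_results(driver, css_selector="div[class*='item']", timeout=15):
    """Return True once an element matching css_selector appears, False on timeout."""
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        return True
    except Exception:
        return False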

def scrape_taobao(url, wait_time=10, max_scrolls=3):
    """爬取淘宝页面"""
    log("="*70)
    log("开始爬取淘宝搜索结果")
    log("="*70)

    driver = setup_driver()
    if not driver:
        log("无法初始化浏览器驱动", "ERROR")
        return None

    all_data = []
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    try:
        # Load the page
        log(f"Opening URL: {url[:80]}...")
        driver.get(url)
        log("✓ Page loaded")

        # Give the JavaScript time to render
        log(f"Waiting {wait_time} seconds for JavaScript to execute...")
        time.sleep(wait_time)

        # Save the initially rendered page
        html_file = os.path.join(OUTPUT_DIR, f"page_initial_{timestamp}.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        log(f"✓ Saved initial page: {html_file}")

        # Scroll to trigger lazy-loaded content
        log(f"Scrolling the page ({max_scrolls} passes)...")
        for i in range(max_scrolls):
            scroll_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            log(f"  Scroll {i+1}/{max_scrolls} (height: {scroll_height}px)")
            time.sleep(3)

        # Save the page after scrolling
        html_file_scroll = os.path.join(OUTPUT_DIR, f"page_scrolled_{timestamp}.html")
        with open(html_file_scroll, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        log(f"✓ Saved post-scroll page: {html_file_scroll}")

        # Try to save a screenshot
        try:
            screenshot_file = os.path.join(OUTPUT_DIR, f"screenshot_{timestamp}.png")
            driver.save_screenshot(screenshot_file)
            log(f"✓ Saved screenshot: {screenshot_file}")
        except Exception as e:
            log(f"Screenshot failed: {e}", "WARNING")

        # Parse the page
        log("\nParsing page content...")
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Count basic page elements
        stats = {
            'div': len(soup.find_all('div')),
            'a': len(soup.find_all('a')),
            'img': len(soup.find_all('img')),
            'script': len(soup.find_all('script'))
        }
        log(f"页面元素统计: {stats}")

        # Locate item/shop containers
        log("\nExtracting item/shop data...")

        # Strategy 1: elements whose class contains a tell-tale keyword
        keywords = ['item', 'shop', 'card', 'product', 'goods']
        for keyword in keywords:
            elements = soup.find_all(class_=lambda x: x and keyword in str(x).lower())
            if elements:
                log(f"  Strategy [class*={keyword}]: found {len(elements)} elements")

                for elem in elements[:20]:  # cap at the first 20
                    item = {
                        'type': elem.name,
                        'class': ' '.join(elem.get('class', [])),
                        'text': elem.get_text(strip=True)[:200],
                    }

                    # Grab a link
                    link = elem.find('a', href=True)
                    if link:
                        item['url'] = link['href']
                    elif elem.name == 'a' and elem.get('href'):
                        item['url'] = elem['href']

                    # Grab an image (lazy-loaded images often keep the real URL in data-src)
                    img = elem.find('img')
                    if img:
                        item['image'] = img.get('src') or img.get('data-src', '')

                    if item['text'] or item.get('url'):
                        all_data.append(item)

                if all_data:
                    break

        # Strategy 2: if nothing matched, fall back to collecting all links
        if not all_data:
            log("  Strategy [all links]...")
            links = soup.find_all('a', href=True)
            for link in links[:30]:
                text = link.get_text(strip=True)
                if text and len(text) > 10:  # skip very short link texts
                    all_data.append({
                        'text': text[:200],
                        'url': link['href'],
                        'class': ' '.join(link.get('class', []))
                    })

        log(f"\n✓ 提取到 {len(all_data)} 条数据")

        # Persist the data
        if all_data:
            # JSON
            json_file = os.path.join(OUTPUT_DIR, f"data_{timestamp}.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(all_data, f, ensure_ascii=False, indent=2)
            log(f"✓ 已保存JSON: {json_file}")

            # CSV
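            # (utf-8-sig writes a BOM so Excel opens the Chinese text correctly)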
            try:
                csv_file = os.path.join(OUTPUT_DIR, f"data_{timestamp}.csv")
                df = pd.DataFrame(all_data)
                df.to_csv(csv_file, index=False, encoding='utf-8-sig')
                log(f"✓ 已保存CSV: {csv_file}")
            except Exception as e:
                log(f"CSV保存失败: {e}", "WARNING")

            # Preview the data
            log("\nData preview (first 5 records):")
            for i, item in enumerate(all_data[:5], 1):
                log(f"\n  [{i}] {item.get('text', 'N/A')[:60]}")
                if 'url' in item:
                    log(f"      URL: {item['url'][:70]}")
        else:
            log("⚠ 未能提取到数据", "WARNING")
            log("  建议检查保存的HTML文件分析页面结构")

        # Write a run report
        report = {
            'timestamp': timestamp,
            'url': url,
            'wait_time': wait_time,
            'max_scrolls': max_scrolls,
            'page_stats': stats,
            'data_count': len(all_data),
            'files': {
                'html_initial': html_file,
                'html_scrolled': html_file_scroll,
                'data_json': json_file if all_data else None
            }
        }

        report_file = os.path.join(OUTPUT_DIR, f"report_{timestamp}.json")
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        log(f"\n✓ 报告已保存: {report_file}")

    except Exception as e:
        log(f"\n✗ 爬取过程出错: {e}", "ERROR")
        import traceback
        log(traceback.format_exc(), "ERROR")

    finally:
        log("\n关闭浏览器...")
        driver.quit()
        log("✓ 浏览器已关闭")

    log("\n" + "="*70)
    log(f"爬取完成! 输出目录: {OUTPUT_DIR}/")
    log("="*70)

    return all_data
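
# Optional refinement (a sketch, not wired in): replace the fixed scroll loop in
# scrape_taobao() with one that stops early once the page height stops growing,
# i.e. no more content is being lazy-loaded. The pause length is an assumption.
def scroll_until_stable(driver, max_scrolls=5, pause=3):
    """Scroll to the bottom repeatedly; return the number of scrolls performed."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for i in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # nothing new loaded, stop early
            return i + 1
        last_height = new_height
    return max_scrolls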

def main():
    url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=13&ie=utf8&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&q=%E7%94%B5%E5%99%A8%E5%AE%B6%E7%94%A8%E5%A4%A7%E5%85%A8&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

    # Parameters:
    # wait_time: seconds to wait for the page to load (10-15 recommended)
    # max_scrolls: number of scroll passes (3-5 recommended)
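
    # A more readable way to build the search URL (commented-out sketch): the long
    # query string above was copied from a live Taobao redirect; it is an untested
    # assumption that the `q` and `tab` parameters alone are enough.
    # from urllib.parse import urlencode
    # url = "https://s.taobao.com/search?" + urlencode({
    #     "q": "电器家用大全",  # search keywords: household appliances
    #     "tab": "shop",         # search shops rather than items
    # })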

    scrape_taobao(url, wait_time=10, max_scrolls=3)

if __name__ == "__main__":
    main()
