#!/usr/bin/env python3
"""
淘宝终极爬虫 - 基于GitHub最佳实践
结合undetected-chromedriver和Cookie池策略
支持命令行自定义规则采集淘宝商品数据
"""

import os
import sys
import csv
import json
import time
import random
import argparse
import logging
import pickle
from datetime import datetime
from pathlib import Path
import re

try:
    import undetected_chromedriver as uc
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException, NoSuchElementException
except ImportError:
    print("错误: 缺少依赖库")
    print("请运行: pip install undetected-chromedriver selenium")
    sys.exit(1)

# Configuration
COOKIE_DIR = "taobao_cookies"
OUTPUT_DIR = "taobao_scraped_data"
LOG_DIR = "taobao_logs"

for d in [COOKIE_DIR, OUTPUT_DIR, LOG_DIR]:
    os.makedirs(d, exist_ok=True)

# Logging configuration
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = os.path.join(LOG_DIR, f"scraper_{timestamp}.log")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)


class CookieManager:
    """Cookie管理器 - 实现Cookie池策略"""

    def __init__(self, cookie_dir=COOKIE_DIR):
        self.cookie_dir = cookie_dir
        self.cookie_file = os.path.join(cookie_dir, "taobao_cookies.pkl")

    def save_cookies(self, driver):
        """保存cookies"""
        cookies = driver.get_cookies()
        with open(self.cookie_file, 'wb') as f:
            pickle.dump(cookies, f)
        logger.info(f"✓ Cookies已保存: {len(cookies)}个")
        return cookies
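
    # To inspect the saved cookie file outside this script (illustrative snippet,
    # assuming the default COOKIE_DIR):
    #   import pickle
    #   with open("taobao_cookies/taobao_cookies.pkl", "rb") as f:
    #       print([c["name"] for c in pickle.load(f)])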

    def load_cookies(self, driver):
        """加载cookies"""
        if not os.path.exists(self.cookie_file):
            logger.warning("⚠ Cookie文件不存在")
            return False

        try:
            with open(self.cookie_file, 'rb') as f:
                cookies = pickle.load(f)

            # Visit the Taobao homepage first so cookies can be set for the domain
            driver.get("https://www.taobao.com")
            time.sleep(2)

            # Add the saved cookies
            for cookie in cookies:
                try:
                    driver.add_cookie(cookie)
                except Exception as e:
                    logger.debug(f"Skipping invalid cookie: {e}")

            logger.info(f"✓ Loaded {len(cookies)} cookies")

            # Refresh so the cookies take effect
            driver.refresh()
            time.sleep(2)

            return True

        except Exception as e:
            logger.error(f"✗ Failed to load cookies: {e}")
            return False

    def is_logged_in(self, driver):
        """检查是否已登录"""
        try:
            driver.get("https://www.taobao.com")
            time.sleep(3)

            # Look for login markers on the page
            page_source = driver.page_source

            # "请登录" ("please log in") appears on pages that require login
            if "请登录" in page_source or "login" in driver.current_url.lower():
                return False

            # Try to locate the logged-in user element
            try:
                driver.find_element(By.CLASS_NAME, "site-nav-user")
                logger.info("✓ Login state detected")
                return True
            except NoSuchElementException:
                pass

            return False

        except Exception as e:
            logger.error(f"检查登录状态失败: {e}")
            return False


class TaobaoScraper:
    """淘宝爬虫 - 使用undetected-chromedriver绕过检测"""

    def __init__(self, headless=True, use_cookie=True):
        self.headless = headless
        self.use_cookie = use_cookie
        self.cookie_manager = CookieManager()
        self.driver = None
        self.results = []

    def setup_driver(self):
        """初始化undetected-chromedriver"""
        logger.info("初始化 undetected-chromedriver...")

        options = uc.ChromeOptions()

        if self.headless:
            options.add_argument('--headless=new')

        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')

        # Randomize the User-Agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        ]
        options.add_argument(f'user-agent={random.choice(user_agents)}')

        try:
            # Prefer the system Chromium/chromedriver (Termux/Android paths);
            # version_main should match the installed Chromium major version
            options.binary_location = '/data/data/com.termux/files/usr/bin/chromium-browser'
            self.driver = uc.Chrome(
                options=options,
                driver_executable_path='/data/data/com.termux/files/usr/bin/chromedriver',
                use_subprocess=True,
                version_main=140
            )
        except Exception as e:
            logger.warning(f"使用系统Chrome失败: {e}")
            logger.info("尝试自动下载chromedriver...")
            self.driver = uc.Chrome(options=options, use_subprocess=True)

        logger.info("✓ Driver初始化成功")
        return self.driver

    def login_if_needed(self):
        """如果需要则登录"""
        if self.use_cookie and self.cookie_manager.load_cookies(self.driver):
            if self.cookie_manager.is_logged_in(self.driver):
                logger.info("✓ 使用Cookie登录成功")
                return True

        logger.warning("⚠ 需要手动登录")
        logger.info("请在浏览器中完成登录...")

        if self.headless:
            logger.error("✗ Headless模式无法手动登录,请使用 --no-headless 参数")
            return False

        # Wait for the user to log in
        self.driver.get("https://login.taobao.com")

        logger.info("Waiting for login to complete (continues automatically once detected)...")

        # Wait up to 5 minutes, polling every 10 seconds
        # (each check reloads taobao.com, so keep the polling interval long)
        for _ in range(30):
            time.sleep(10)
            if self.cookie_manager.is_logged_in(self.driver):
                logger.info("✓ Login successful!")
                self.cookie_manager.save_cookies(self.driver)
                return True

        logger.error("✗ 登录超时")
        return False

    def search_products(self, keyword, max_pages=1, sort='default'):
        """搜索商品"""
        logger.info(f"\n{'='*70}")
        logger.info(f"搜索关键词: {keyword}")
        logger.info(f"{'='*70}")

        results = []

        for page in range(1, max_pages + 1):
            logger.info(f"\n[页面 {page}/{max_pages}]")

            # 构建搜索URL
            start = (page - 1) * 44
            search_url = f"https://s.taobao.com/search?q={keyword}&s={start}"

            # 添加排序参数
            if sort == 'price_asc':
                search_url += "&sort=price-asc"
            elif sort == 'price_desc':
                search_url += "&sort=price-desc"
            elif sort == 'sales':
                search_url += "&sort=sale-desc"

            logger.info(f"访问: {search_url}")

            self.driver.get(search_url)

            # Random wait to reduce the chance of bot detection
            wait_time = random.uniform(3, 6)
            logger.info(f"Waiting {wait_time:.1f}s...")
            time.sleep(wait_time)

            # Scroll to trigger lazy loading
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            time.sleep(2)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Extract product data from the current page
            page_products = self.extract_products_from_page()

            if page_products:
                results.extend(page_products)
                logger.info(f"✓ 提取 {len(page_products)} 个商品")
            else:
                logger.warning(f"⚠ 未提取到商品数据")

                # 保存页面用于调试
                debug_file = os.path.join(OUTPUT_DIR, f"debug_{keyword}_page{page}.html")
                with open(debug_file, 'w', encoding='utf-8') as f:
                    f.write(self.driver.page_source)
                logger.info(f"已保存页面HTML: {debug_file}")

            # Delay before turning the page
            if page < max_pages:
                delay = random.uniform(2, 4)
                logger.info(f"Page-turn delay: {delay:.1f}s...")
                time.sleep(delay)

        logger.info(f"\n✓ 搜索完成: {keyword}")
        logger.info(f"  总共提取: {len(results)} 个商品")

        return results

    def extract_products_from_page(self):
        """从当前页面提取商品数据"""
        products = []

        try:
            # Wait for the legacy result container; newer layouts may not use it,
            # so a timeout here is not fatal
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "items"))
                )
            except TimeoutException:
                logger.debug("'items' container not found, trying fallback selectors")

            # Try several selector strategies; class names in the newer layout are
            # hashed (e.g. Card--doubleCard--<hash>), hence the substring match
            selectors = [
                (By.CSS_SELECTOR, "div.items > div[data-category='auctions']"),
                (By.CSS_SELECTOR, "div.item"),
                (By.CSS_SELECTOR, "div[class*='Card--doubleCard']"),
                (By.CLASS_NAME, "item"),
            ]

            items = []
            for selector_type, selector in selectors:
                try:
                    items = self.driver.find_elements(selector_type, selector)
                    if items:
                        logger.debug(f"Using selector: {selector_type} = {selector}")
                        break
                except Exception:
                    continue

            if not items:
                logger.warning("No product elements found")
                return products

            logger.info(f"Found {len(items)} product elements")

            # Extract each product's info (cap at 50 per page)
            for idx, item in enumerate(items[:50], 1):
                try:
                    product = self.extract_product_info(item, idx)
                    if product and product.get('title'):
                        products.append(product)
                except Exception as e:
                    logger.debug(f"Failed to extract product {idx}: {e}")

        except TimeoutException:
            logger.warning("Timed out waiting for the page to load")
        except Exception as e:
            logger.error(f"Failed to extract products: {e}")

        return products

    def extract_product_info(self, element, index):
        """提取单个商品信息"""
        product = {}

        try:
            # Title
            title_selectors = [
                (By.CSS_SELECTOR, "div.title > a"),
                (By.CSS_SELECTOR, "a.pic-link"),
                (By.CLASS_NAME, "title"),
            ]

            for sel_type, sel in title_selectors:
                try:
                    title_elem = element.find_element(sel_type, sel)
                    product['title'] = title_elem.get_attribute('title') or title_elem.text.strip()
                    if product['title']:
                        break
                except Exception:
                    continue

            # Price
            price_selectors = [
                (By.CSS_SELECTOR, "strong.price"),
                (By.CLASS_NAME, "price"),
                (By.CSS_SELECTOR, "div.price"),
            ]

            for sel_type, sel in price_selectors:
                try:
                    price_elem = element.find_element(sel_type, sel)
                    price_text = price_elem.text.strip()
                    # Extract the numeric part
                    price_match = re.search(r'(\d+\.?\d*)', price_text)
                    if price_match:
                        product['price'] = price_match.group(1)
                        break
                except Exception:
                    continue

            # Sales volume
            try:
                sales_elem = element.find_element(By.CLASS_NAME, "deal-cnt")
                product['sales'] = sales_elem.text.strip()
            except NoSuchElementException:
                product['sales'] = ''

            # Shop name
            try:
                shop_elem = element.find_element(By.CLASS_NAME, "shop")
                product['shop'] = shop_elem.text.strip()
            except NoSuchElementException:
                product['shop'] = ''

            # Product URL
            try:
                link_elem = element.find_element(By.CSS_SELECTOR, "a.pic-link")
                product['url'] = link_elem.get_attribute('href')
            except NoSuchElementException:
                product['url'] = ''

            # Product ID (parsed from the URL)
            if product.get('url'):
                id_match = re.search(r'id=(\d+)', product['url'])
                if id_match:
                    product['item_id'] = id_match.group(1)

            product['index'] = index
            product['scraped_at'] = datetime.now().isoformat()

        except Exception as e:
            logger.debug(f"提取商品信息异常: {e}")

        return product

    def save_results(self, results, keyword, output_format='json'):
        """保存结果"""
        if not results:
            logger.warning("没有数据可保存")
            return

        timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = None

        if output_format == 'json':
            filename = os.path.join(OUTPUT_DIR, f"{keyword}_{timestamp_str}.json")
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            logger.info(f"✓ JSON saved: {filename}")

        elif output_format == 'csv':
            filename = os.path.join(OUTPUT_DIR, f"{keyword}_{timestamp_str}.csv")

            # Use the union of keys across all rows, since optional fields such as
            # item_id may be missing from the first product
            keys = list(dict.fromkeys(k for row in results for k in row))
            with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(results)
            logger.info(f"✓ CSV saved: {filename}")

        return filename

    def close(self):
        """关闭浏览器"""
        if self.driver:
            self.driver.quit()
            logger.info("✓ 浏览器已关闭")


def parse_args():
    """解析命令行参数"""
    parser = argparse.ArgumentParser(
        description='淘宝商品爬虫 - 支持自定义规则采集',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  # Search a single keyword
  python %(prog)s -k 牙刷

  # Search multiple keywords
  python %(prog)s -k 牙刷 牙膏 洗发水

  # Read keywords from a file
  python %(prog)s -f daily_goods_100.txt

  # Set the page count and sort order
  python %(prog)s -k 手机 -p 3 --sort sales

  # Save as CSV
  python %(prog)s -k 笔记本电脑 --format csv

  # Skip saved cookies (first login)
  python %(prog)s -k 衣服 --no-cookie --no-headless
        '''
    )

    parser.add_argument('-k', '--keywords', nargs='+', help='Search keyword(s), one or more')
    parser.add_argument('-f', '--file', help='Keyword file (one keyword per line)')
    parser.add_argument('-p', '--pages', type=int, default=1, help='Pages to scrape per keyword (default: 1)')
    parser.add_argument('--sort', choices=['default', 'price_asc', 'price_desc', 'sales'],
                        default='default', help='Sort order (default: default)')
    parser.add_argument('--format', choices=['json', 'csv'], default='json',
                        help='Output format (default: json)')
    parser.add_argument('--no-headless', action='store_true', help='Show the browser window')
    parser.add_argument('--no-cookie', action='store_true', help='Do not use saved cookies (manual login required)')
    parser.add_argument('--delay', type=float, default=3.0, help='Base delay in seconds between keywords (default: 3)')

    return parser.parse_args()


def main():
    """主函数"""
    args = parse_args()

    # Build the keyword list
    keywords = []

    if args.keywords:
        keywords.extend(args.keywords)

    if args.file:
        if os.path.exists(args.file):
            with open(args.file, 'r', encoding='utf-8') as f:
                keywords.extend([line.strip() for line in f if line.strip()])
        else:
            logger.error(f"文件不存在: {args.file}")
            return

    if not keywords:
        logger.error("请提供关键词 (-k) 或关键词文件 (-f)")
        logger.info("使用 -h 查看帮助")
        return

    logger.info("="*70)
    logger.info("淘宝终极爬虫")
    logger.info("="*70)
    logger.info(f"关键词数量: {len(keywords)}")
    logger.info(f"每个关键词页数: {args.pages}")
    logger.info(f"排序方式: {args.sort}")
    logger.info(f"输出格式: {args.format}")
    logger.info(f"Headless模式: {not args.no_headless}")
    logger.info(f"使用Cookie: {not args.no_cookie}")
    logger.info("="*70)

    # Initialize the scraper
    scraper = TaobaoScraper(
        headless=not args.no_headless,
        use_cookie=not args.no_cookie
    )

    try:
        scraper.setup_driver()

        # Log in (reuses saved cookies when available, otherwise prompts for a manual login)
        if not scraper.login_if_needed():
            logger.error("Login failed; rerun with --no-headless and complete the login manually")
            return

        # Scrape each keyword
        all_results = {}

        for idx, keyword in enumerate(keywords, 1):
            logger.info(f"\n[{idx}/{len(keywords)}] 处理关键词: {keyword}")

            results = scraper.search_products(
                keyword=keyword,
                max_pages=args.pages,
                sort=args.sort
            )

            if results:
                # Save the results
                filename = scraper.save_results(results, keyword, args.format)
                all_results[keyword] = {
                    'count': len(results),
                    'file': filename
                }

            # Delay between keywords
            if idx < len(keywords):
                delay = random.uniform(args.delay, args.delay + 2)
                logger.info(f"Waiting {delay:.1f}s before the next keyword...")
                time.sleep(delay)

        # Summary
        logger.info("\n" + "="*70)
        logger.info("Scraping complete!")
        logger.info("="*70)

        for keyword, info in all_results.items():
            logger.info(f"{keyword}: {info['count']} 个商品 -> {info['file']}")

        logger.info(f"\n总计: {sum(info['count'] for info in all_results.values())} 个商品")
        logger.info(f"日志文件: {log_file}")
        logger.info("="*70)

    except KeyboardInterrupt:
        logger.warning("\n用户中断")
    except Exception as e:
        logger.error(f"执行出错: {e}", exc_info=True)
    finally:
        scraper.close()


if __name__ == "__main__":
    main()
