#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
1688汽车用品商品数据提取器
使用Selenium模拟浏览器访问动态页面并提取商品信息
"""

import json
import time
import csv
import os
import shutil
import tempfile
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager


class Product1688Scraper:
    def __init__(self, headless=True):
        """初始化爬虫"""
        self.url = "https://show.1688.com/mtb/cht/eu1w74ug.html?__pageId__=222095&cms_id=222095&wh_pha=true"
        self.products = []
        self.headless = headless
        self.driver = None
        self.temp_profile_dir = None

    def setup_driver(self):
        """Configure and start Chrome."""
        # Create a fully isolated temporary profile directory so this run
        # cannot collide with an already-running Chrome session
        self.temp_profile_dir = tempfile.mkdtemp(prefix='chrome_profile_')
        print(f"Using temporary profile directory: {self.temp_profile_dir}")

        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')

        # Core option: point Chrome at the isolated profile directory
        chrome_options.add_argument(f'--user-data-dir={self.temp_profile_dir}')

        # Stability and performance flags
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-software-rasterizer')
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
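
        # Note: --disable-blink-features=AutomationControlled and the custom
        # user agent above only mask the most obvious automation fingerprints;
        # 1688 may still serve a login or captcha wall to automated traffic,
        # in which case extraction will find nothing.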

        # Let webdriver-manager download and manage a matching ChromeDriver
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        if not self.headless:
            try:
                self.driver.maximize_window()
            except Exception:
                pass  # Some window managers cannot maximize; not fatal

    def scroll_page(self, max_scrolls=30):
        """Scroll to the bottom repeatedly until no new content loads."""
        print("Scrolling the page to load all products...")
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        for _ in range(max_scrolls):  # hard cap so a growing page cannot loop forever
            # Scroll down
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # If the page height stopped growing, lazy loading has finished
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
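
        # Assumption: content loads as the bottom of the page is reached.
        # A page that paginates via a "load more" button would instead need
        # an explicit click step here.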

    def extract_products(self):
        """Locate product elements on the page and extract their data."""
        print("Extracting product data...")

        # Common product-card selector patterns (adjust to the actual page structure)
        selectors = [
            "div.item",
            "div.product-item",
            "div[class*='card']",
            "div[class*='goods']",
            "div[class*='product']",
            "a[class*='item']",
        ]

        product_elements = []
        for selector in selectors:
            try:
                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if len(elements) > 5:  # more than a handful of matches suggests a real product grid
                    product_elements = elements
                    print(f"Selector '{selector}' matched {len(elements)} products")
                    break
            except Exception:
                continue

        if not product_elements:
            print("未找到商品元素，尝试通过链接提取...")
            # 备用方案：查找所有包含商品链接的元素
            product_elements = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='offer'], a[href*='detail']")

        # Extract the fields of each product element
        for idx, element in enumerate(product_elements):
            try:
                product = self.extract_product_info(element, idx)
                if product and self.is_auto_product(product):
                    self.products.append(product)
            except Exception as e:
                print(f"Error extracting product {idx + 1}: {e}")
                continue

        print(f"共提取到 {len(self.products)} 个汽车用品商品")

    def extract_product_info(self, element, idx):
        """Extract the fields of a single product element."""
        product = {
            'index': idx + 1,
            'title': '',
            'price': '',
            'image': '',
            'link': '',
            'shop_name': '',
            'extract_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        try:
            # Title: prefer the image alt text, then common heading/title nodes
            title_selectors = ['img[alt]', 'h3', 'h4', 'div[class*="title"]', 'span[class*="title"]']
            for sel in title_selectors:
                try:
                    title_elem = element.find_element(By.CSS_SELECTOR, sel)
                    if sel == 'img[alt]':
                        # alt may be missing; coerce None to '' so later
                        # string operations cannot fail
                        product['title'] = title_elem.get_attribute('alt') or ''
                    else:
                        product['title'] = title_elem.text
                    if product['title']:
                        break
                except NoSuchElementException:
                    continue

            # Price: any element whose class mentions "price"
            price_selectors = ['span[class*="price"]', 'div[class*="price"]', 'em[class*="price"]']
            for sel in price_selectors:
                try:
                    price_elem = element.find_element(By.CSS_SELECTOR, sel)
                    product['price'] = price_elem.text
                    if product['price']:
                        break
                except NoSuchElementException:
                    continue

            # Image: fall back to data-src for lazily loaded images
            img_selectors = ['img']
            for sel in img_selectors:
                try:
                    img_elem = element.find_element(By.CSS_SELECTOR, sel)
                    product['image'] = img_elem.get_attribute('src') or img_elem.get_attribute('data-src') or ''
                    if product['image']:
                        break
                except NoSuchElementException:
                    continue

            # Link: the element itself may be the anchor; otherwise look inside it
            try:
                if element.tag_name == 'a':
                    product['link'] = element.get_attribute('href')
                else:
                    link_elem = element.find_element(By.CSS_SELECTOR, 'a')
                    product['link'] = link_elem.get_attribute('href')
            except NoSuchElementException:
                pass

            # Shop name
            shop_selectors = ['span[class*="shop"]', 'div[class*="shop"]', 'a[class*="shop"]']
            for sel in shop_selectors:
                try:
                    shop_elem = element.find_element(By.CSS_SELECTOR, sel)
                    product['shop_name'] = shop_elem.text
                    if product['shop_name']:
                        break
                except NoSuchElementException:
                    continue

        except Exception as e:
            print(f"解析商品详情时出错: {str(e)}")

        return product if product['title'] or product['link'] else None
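
    def _first_text(self, element, selectors, attr=None):
        """Optional helper sketch mirroring the selector loops above.

        The loops in extract_product_info are kept explicit, but a refactor
        could route them through this helper. ``attr`` names an attribute to
        read instead of the element's text; returns '' when nothing matches.
        """
        for sel in selectors:
            try:
                elem = element.find_element(By.CSS_SELECTOR, sel)
                value = elem.get_attribute(attr) if attr else elem.text
                if value:
                    return value
            except NoSuchElementException:
                continue
        return ''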

    def is_auto_product(self, product):
        """Loose heuristic: does the title mention an automotive keyword?

        Some keywords (e.g. 香水 / perfume) are generic and may over-match.
        """
        # Chinese keywords kept verbatim: they are matched against Chinese titles
        auto_keywords = [
            '汽车', '车载', '车用', '轮胎', '机油', '坐垫', '脚垫',
            '香水', '挂件', '贴纸', '改装', '配件', '座套', '方向盘',
            '车灯', '雨刷', '后视镜', '行车记录仪', '充电器', '支架',
            '洗车', '打蜡', '车蜡', '清洁', '美容', '保养'
        ]

        # Guard against a None title (e.g. from a missing alt attribute)
        title = (product.get('title') or '').lower()

        # True if the title contains any automotive-related keyword
        for keyword in auto_keywords:
            if keyword in title:
                return True

        return False

    def save_to_json(self, filename='products_1688.json'):
        """保存为JSON格式"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, ensure_ascii=False, indent=2)
        print(f"数据已保存到 {filename}")

    def save_to_csv(self, filename='products_1688.csv'):
        """保存为CSV格式"""
        if not self.products:
            print("没有数据可保存")
            return

        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            fieldnames = ['index', 'title', 'price', 'image', 'link', 'shop_name', 'extract_time']
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for product in self.products:
                writer.writerow(product)

        print(f"数据已保存到 {filename}")

    def run(self):
        """Run the scraper end to end."""
        try:
            print("Starting browser...")
            self.setup_driver()

            print(f"Visiting page: {self.url}")
            self.driver.get(self.url)

            # Wait for the document, then give client-side rendering time to settle
            print("Waiting for the page to load...")
            try:
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'body')))
            except TimeoutException:
                print("Timed out waiting for the page body; continuing anyway")
            time.sleep(3)

            # Scroll to trigger lazy loading of all content
            self.scroll_page()

            # Extra wait for late-arriving dynamic content
            time.sleep(3)

            # Extract products
            self.extract_products()

            # Save the data
            if self.products:
                self.save_to_json()
                self.save_to_csv()
            else:
                print("Warning: no product data extracted")
                # Save the page source to help debug the selectors
                with open('page_source.html', 'w', encoding='utf-8') as f:
                    f.write(self.driver.page_source)
                print("Page source saved to page_source.html; inspect it to adjust the selectors")

        except Exception as e:
            print(f"运行出错: {str(e)}")
            import traceback
            traceback.print_exc()

        finally:
            if self.driver:
                print("关闭浏览器...")
                self.driver.quit()

            # Clean up the temporary profile directory
            if self.temp_profile_dir and os.path.exists(self.temp_profile_dir):
                try:
                    shutil.rmtree(self.temp_profile_dir)
                except OSError:
                    pass


def main():
    """主函数"""
    print("=" * 60)
    print("1688汽车用品商品数据提取器")
    print("=" * 60)

    # Create the scraper (headless=False lets you watch the browser work)
    scraper = Product1688Scraper(headless=True)

    # Run the scraper
    scraper.run()

    print("=" * 60)
    print("提取完成！")
    print("=" * 60)


if __name__ == '__main__':
    main()
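
# Typical invocation (script filename is illustrative): python scraper_1688.py
# Requires a local Chrome installation; webdriver-manager fetches a matching
# ChromeDriver automatically on first run.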
