#!/usr/bin/env python3
"""
淘宝店铺搜索爬虫 - 使用Selenium
安装: pip install selenium pandas beautifulsoup4
"""

import json
import time
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def setup_driver():
    """配置Chrome驱动"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36')
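    # Note: a mobile user agent is set above; Taobao may serve a different
    # (sometimes lighter) page layout to mobile clients than to desktop ones.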

    try:
        driver = webdriver.Chrome(options=chrome_options)
        return driver
    except Exception as e:
        print(f"Chrome驱动初始化失败: {e}")
        print("请确保已安装chromedriver: pkg install chromium")
        return None
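
# A minimal, hedged sketch of a stealthier setup. Taobao actively detects
# automation, and a plain headless Chrome is often bounced to a login or
# captcha page. The flags below are a common mitigation, not a guarantee;
# whether they defeat Taobao's current checks is an open assumption.
def setup_stealth_driver():
    """Like setup_driver(), but with basic anti-automation masking (best effort)."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    # Hide navigator.webdriver before any page script runs (Chrome DevTools Protocol).
    driver.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    return driver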

def scrape_page(driver, url, max_scrolls=3):
    """爬取单页数据"""
    print(f"正在访问: {url}")
    driver.get(url)

    # Wait for the page body, then give dynamically rendered content time to settle
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'body'))
    )
    time.sleep(5)

    # Scroll to load more content
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    # Grab the rendered page source
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the shop data
    shops = []

    # Try broad selectors: class names containing 'shop' or 'item'
    shop_elements = soup.find_all(['div', 'a'], class_=lambda x: x and ('shop' in x.lower() or 'item' in x.lower()))

    print(f"找到 {len(shop_elements)} 个可能的店铺元素")

    for elem in shop_elements[:50]:  # cap at 50 elements
        try:
            shop = {
                'name': '',
                'url': '',
                'description': '',
                'location': '',
                'raw_html': str(elem)[:500]  # keep the first 500 chars for debugging
            }

            # Extract the shop name
            name_elem = elem.find(['span', 'a', 'h3'], class_=lambda x: x and 'name' in x.lower())
            if name_elem:
                shop['name'] = name_elem.get_text(strip=True)

            # Extract the link
            link = elem.find('a')
            if link and link.get('href'):
                shop['url'] = link['href']
            elif elem.name == 'a' and elem.get('href'):
                shop['url'] = elem['href']

            # Extract the description
            desc_elem = elem.find(['span', 'div'], class_=lambda x: x and ('desc' in x.lower() or 'intro' in x.lower()))
            if desc_elem:
                shop['description'] = desc_elem.get_text(strip=True)

            if shop['name'] or shop['url']:
                shops.append(shop)
        except Exception:
            continue

    return shops
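
# Hedged sketch of pagination: older Taobao search URLs paged results via an
# `s` offset parameter (44 results per page). That scheme is an assumption
# here and may not match the current site; verify before relying on it.
def scrape_multiple_pages(driver, base_url, pages=3, page_size=44):
    """Scrape several result pages by appending an offset parameter."""
    all_shops = []
    for page in range(pages):
        paged_url = f"{base_url}&s={page * page_size}"  # offset-based paging (assumed)
        all_shops.extend(scrape_page(driver, paged_url))
        time.sleep(3)  # brief pause between pages to avoid hammering the site
    return all_shops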

def main():
    url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=13&ie=utf8&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&q=%E7%94%B5%E5%99%A8%E5%AE%B6%E7%94%A8%E5%A4%A7%E5%85%A8&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

    driver = setup_driver()
    if not driver:
        return

    try:
        shops = scrape_page(driver, url)

        print(f"\n总共爬取到 {len(shops)} 个店铺")

        if shops:
            # Save JSON
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            json_file = f'taobao_shops_{timestamp}.json'
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(shops, f, ensure_ascii=False, indent=2)
            print(f"已保存JSON: {json_file}")

            # Save CSV
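            # utf-8-sig writes a BOM so Excel detects UTF-8 and renders Chinese text correctly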
            csv_file = f'taobao_shops_{timestamp}.csv'
            df = pd.DataFrame(shops)
            df.to_csv(csv_file, index=False, encoding='utf-8-sig')
            print(f"已保存CSV: {csv_file}")

            # Preview
            print("\nData preview:")
            for i, shop in enumerate(shops[:5], 1):
                print(f"\n{i}. {shop.get('name', 'N/A')}")
                print(f"   URL: {shop.get('url', 'N/A')[:80]}")
        else:
            print("未爬取到数据")
            # 保存完整HTML用于调试
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            print("已保存页面HTML到debug_page.html用于调试")

    finally:
        driver.quit()

if __name__ == "__main__":
    main()
