#!/usr/bin/env python3
"""
淘宝店铺搜索结果爬虫 - 使用Playwright爬取动态网页
需要安装: pip install playwright beautifulsoup4 pandas
然后运行: playwright install chromium
"""

import asyncio
import json
import pandas as pd
from playwright.async_api import async_playwright
from datetime import datetime

async def scrape_taobao_shops(url, max_pages=3):
    """爬取淘宝店铺搜索结果"""

    all_shops = []

    async with async_playwright() as p:
        # Launch the browser in headless mode
        browser = await p.chromium.launch(headless=True)
        # Use a desktop user agent: the target URL is the desktop search page,
        # and a mobile UA can trigger a redirect to the m.taobao.com layout,
        # which uses entirely different markup
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
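
        # Taobao's search usually sits behind a login wall, so an anonymous
        # session may only see a login prompt. A minimal sketch, assuming
        # cookies were exported earlier from a logged-in context via
        # `await context.cookies()` to a hypothetical 'taobao_cookies.json':
        #
        #     import os
        #     if os.path.exists('taobao_cookies.json'):
        #         with open('taobao_cookies.json', encoding='utf-8') as f:
        #             await context.add_cookies(json.load(f))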
        page = await context.new_page()

        try:
            print(f"正在访问: {url}")
            await page.goto(url, wait_until='networkidle', timeout=60000)

            # Give dynamic content time to render
            await page.wait_for_timeout(3000)

            for page_num in range(1, max_pages + 1):
                print(f"\n正在爬取第 {page_num} 页...")

                # Wait for the shop list to render
                await page.wait_for_timeout(2000)
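
                # If the shop list lazy-loads on scroll (an assumption worth
                # verifying in a headed browser), scrolling to the bottom
                # before extraction surfaces more items:
                #
                #     await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
                #     await page.wait_for_timeout(1000)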

                # Extract shop data from the rendered DOM
                shops = await page.evaluate('''() => {
                    const results = [];
                    // Try several selector variants; Taobao's class names change frequently
                    const shopItems = document.querySelectorAll('.shop-item, .ShopItem, [class*="shop"]');

                    shopItems.forEach(item => {
                        try {
                            const shop = {
                                name: item.querySelector('.shop-name, .shopname, [class*="shopName"]')?.textContent?.trim() || '',
                                url: item.querySelector('a')?.href || '',
                                description: item.querySelector('.shop-desc, [class*="desc"]')?.textContent?.trim() || '',
                                location: item.querySelector('.location, [class*="location"]')?.textContent?.trim() || '',
                                fans: item.querySelector('.fans, [class*="fans"]')?.textContent?.trim() || '',
                                rating: item.querySelector('.rating, [class*="rating"]')?.textContent?.trim() || ''
                            };
                            if (shop.name || shop.url) {
                                results.push(shop);
                            }
                        } catch (e) {
                            console.error('Failed to extract a shop item:', e);
                        }
                    });

                    return results;
                }''')

                print(f"第 {page_num} 页找到 {len(shops)} 个店铺")
                all_shops.extend(shops)

                # Try to advance to the next page
                if page_num < max_pages:
                    try:
                        next_button = await page.query_selector('.next, [class*="next"], .pagination-next')
                        if next_button:
                            await next_button.click()
                            await page.wait_for_timeout(3000)
                        else:
                            print("未找到下一页按钮")
                            break
                    except Exception as e:
                        print(f"翻页失败: {e}")
                        break

        except Exception as e:
            print(f"爬取过程出错: {e}")

        finally:
            await browser.close()

    return all_shops
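
# A small helper sketch: pagination on Taobao occasionally repeats items, so
# deduplicating by URL (falling back to name) before saving keeps the output
# clean. This is optional post-processing, not part of the scrape itself.
def dedupe_shops(shops):
    """Return shops with duplicates removed, keyed by url (or name)."""
    seen = set()
    unique = []
    for shop in shops:
        key = shop.get('url') or shop.get('name')
        if key and key not in seen:
            seen.add(key)
            unique.append(shop)
    return unique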

async def main():
    # Target URL (Taobao shop-tab search for home appliances)
    url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=13&ie=utf8&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&q=%E7%94%B5%E5%99%A8%E5%AE%B6%E7%94%A8%E5%A4%A7%E5%85%A8&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

    # Start scraping (up to 3 pages)
    shops = await scrape_taobao_shops(url, max_pages=3)
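
    # Optional: drop duplicate entries across pages using the dedupe_shops
    # sketch defined above
    shops = dedupe_shops(shops)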

    print(f"\n总共爬取到 {len(shops)} 个店铺")

    if shops:
        # Save as JSON
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        json_file = f'taobao_shops_{timestamp}.json'
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(shops, f, ensure_ascii=False, indent=2)
        print(f"已保存JSON文件: {json_file}")

        # Save as CSV (utf-8-sig so Excel displays Chinese text correctly)
        csv_file = f'taobao_shops_{timestamp}.csv'
        df = pd.DataFrame(shops)
        df.to_csv(csv_file, index=False, encoding='utf-8-sig')
        print(f"已保存CSV文件: {csv_file}")

        # Preview the first few rows
        print("\nPreview of the first 5 rows:")
        print(df.head())
    else:
        print("未爬取到任何数据，可能需要调整选择器或处理反爬虫机制")

if __name__ == "__main__":
    asyncio.run(main())
