import os
import json
import time
import random
import requests
from bs4 import BeautifulSoup

# Create output directories
os.makedirs('results', exist_ok=True)
os.makedirs('json_data', exist_ok=True)

# Base shop-search URL copied from the browser address bar; note it carries a
# pile of tracking parameters but no q parameter, so the search term has to be
# appended per request (see the loop below).
base_url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=4&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
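
# Optional hardening (an addition, not in the original script): share one
# Session so TCP connections are reused, and retry transient failures
# (429/5xx) with backoff. The main loop below uses session.get; switch back
# to requests.get(url, headers=headers, timeout=10) for one-shot requests.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(headers)
retry = Retry(total=3, backoff_factor=1.0, status_forcelist=(429, 500, 502, 503, 504))
session.mount('https://', HTTPAdapter(max_retries=retry))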

# Read one search term per line from daily_goods.txt, skipping blank lines
with open('daily_goods.txt', 'r', encoding='utf-8') as f:
    queries = [line.strip() for line in f if line.strip()]
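
# Heuristic (sketch): Taobao often answers anonymous requests with a login or
# captcha page rather than results. The marker strings are assumptions, not a
# documented contract; call this on response.text before parsing if runs keep
# producing empty JSON files.
def looks_blocked(html):
    markers = ('login.taobao.com', '亲，请登录', 'punish')
    return any(marker in html for marker in markers)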

for query in queries[:10]:  # Limit to the first 10 queries while testing, to avoid hammering the site
    try:
        # base_url contains no q parameter, so the original str.replace() call
        # matched nothing and every request fetched the same page; append the
        # URL-encoded search term instead.
        url = f"{base_url}&q={requests.utils.quote(query)}"
        response = session.get(url, timeout=10)
        response.raise_for_status()
        # Save the raw HTML for later inspection; strip path separators so the
        # query is safe to use as a filename.
        safe_query = query.replace('/', '_')
        with open(f'results/{safe_query}.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        # Parse the page. Note: Taobao renders search results with JavaScript,
        # so the static HTML may not contain them at all; the class names below
        # are placeholders to check against the saved pages and adjust.
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.find_all('div', class_='item')
        data = []
        for item in items[:5]:  # Keep at most the first 5 items per query
            title_elem = item.find('a', class_='title')
            price_elem = item.find('span', class_='price')
            title = title_elem.text.strip() if title_elem else 'N/A'
            price = price_elem.text.strip() if price_elem else 'N/A'
            data.append({'title': title, 'price': price})
        with open(f'json_data/{safe_query}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        time.sleep(random.uniform(1, 3))  # Random delay between requests to stay polite
    except Exception as e:
        print(f"Error with {query}: {e}")

# Analysis: reload the per-query JSON files and aggregate them
all_data = {}
for query in queries[:10]:
    safe_query = query.replace('/', '_')
    try:
        with open(f'json_data/{safe_query}.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
            all_data[query] = data
    except (FileNotFoundError, json.JSONDecodeError):
        # Queries whose scrape failed get an empty list instead of crashing
        all_data[query] = []

# Generate a summary report
report = {
    "total_queries": len(all_data),
    "average_items_per_query": sum(len(v) for v in all_data.values()) / len(all_data) if all_data else 0,
    "details": all_data
}

with open('analysis_report.json', 'w', encoding='utf-8') as f:
    json.dump(report, f, ensure_ascii=False, indent=4)
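
# Quick console summary (sketch, stdlib only): list queries by how many items
# they yielded, so empty scrapes stand out without opening the report.
for q, items in sorted(all_data.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f"{q}: {len(items)} items")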

print("Scraping completed. Check results/ and json_data/ folders, and analysis_report.json")