#!/usr/bin/env python3
"""
淘宝高级爬虫 - 提取JavaScript中的数据
"""

import json
import os
import re
import traceback
from datetime import datetime

import requests
from bs4 import BeautifulSoup

OUTPUT_DIR = "taobao_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log(message, level="INFO"):
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

def extract_json_from_scripts(html):
    """从script标签中提取JSON数据"""
    log("从JavaScript中提取数据...")

    soup = BeautifulSoup(html, 'html.parser')
    scripts = soup.find_all('script')

    all_json_data = []
    patterns = [
        r'g_page_config\s*=\s*({.*?});',
        r'window\.g_config\s*=\s*({.*?});',
        r'__decodeQ\s*=\s*[\'"](.+?)[\'"]',
        r'window\.__INITIAL_STATE__\s*=\s*({.*?});',
        r'g_srp_loadCss\s*=\s*({.*?});',
    ]
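    # Note: these non-greedy ({.*?}); captures stop at the first "};", which can
    # truncate objects whose serialized form contains that sequence. See the
    # brace-matching extract_balanced_json sketch below for a sturdier fallback.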

    for idx, script in enumerate(scripts):
        if not script.string:
            continue

        script_text = script.string

        # Try each pattern in turn
        for pattern in patterns:
            matches = re.findall(pattern, script_text, re.DOTALL)
            if matches:
                log(f"  在Script {idx}中找到匹配模式: {pattern[:30]}...")
                for match in matches:
                    try:
                        # Attempt to parse the capture as JSON
                        if match.startswith('{'):
                            data = json.loads(match)
                            all_json_data.append({
                                'script_index': idx,
                                'pattern': pattern,
                                'data': data
                            })
                            log("    ✓ Parsed JSON data successfully")
                        else:
                            log(f"    Data: {match[:100]}")
                            all_json_data.append({
                                'script_index': idx,
                                'pattern': pattern,
                                'value': match
                            })
                    except json.JSONDecodeError as e:
                        log(f"    ✗ JSON parsing failed: {e}", "WARNING")
                        # Keep the raw capture for manual analysis
                        all_json_data.append({
                            'script_index': idx,
                            'pattern': pattern,
                            'raw': match[:500]
                        })

        # 保存包含"data"或"items"的脚本
        if any(keyword in script_text for keyword in ['"data"', '"items"', '"auctions"', '"shopItems"']):
            log(f"  Script {idx} 可能包含商品数据")

    return all_json_data
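
# A minimal sketch of a sturdier alternative to the non-greedy regexes above:
# scan forward from an anchor string (e.g. "g_page_config") and match braces to
# capture the whole object instead of stopping at the first "};". It assumes the
# embedded literal is strict JSON, which these page configs historically were;
# JS-style literals (single quotes, unquoted keys) would need a tolerant parser.
def extract_balanced_json(text, anchor):
    """Return the first balanced {...} object after `anchor`, parsed, or None."""
    start = text.find(anchor)
    if start == -1:
        return None
    brace = text.find('{', start)
    if brace == -1:
        return None
    depth = 0
    in_string = False
    escaped = False
    for pos in range(brace, len(text)):
        ch = text[pos]
        if in_string:
            # Inside a string literal: only track escapes and the closing quote
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(text[brace:pos + 1])
                except json.JSONDecodeError:
                    return None
    return None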

def fetch_and_analyze(url):
    """获取并分析页面"""
    log("="*70)
    log("淘宝高级爬虫 - JavaScript数据提取")
    log("="*70)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Referer': 'https://www.taobao.com/',
    }
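    # Note: Taobao search pages normally require valid login cookies; an anonymous
    # request like this one is often redirected to a login or CAPTCHA page, so
    # treat anything extracted from the response as best-effort.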

    log(f"请求URL: {url[:80]}...")

    try:
        response = requests.get(url, headers=headers, timeout=30)
        log(f"✓ 状态码: {response.status_code}")

        if response.status_code != 200:
            log(f"✗ 请求失败，状态码: {response.status_code}", "ERROR")
            return

        html = response.text
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Save the raw HTML
        html_file = os.path.join(OUTPUT_DIR, f"page_{timestamp}.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html)
        log(f"✓ HTML saved: {html_file}")

        # Analyze the page
        log("\nPage analysis:")
        log(f"  HTML size: {len(html):,} bytes")

        soup = BeautifulSoup(html, 'html.parser')
        log(f"  标题: {soup.title.string if soup.title else 'N/A'}")

        # Look for key information
        decode_q = re.search(r'__decodeQ\s*=\s*[\'"](.+?)[\'"]', html)
        if decode_q:
            log(f"  Search keyword: {decode_q.group(1)}")

        # Extract data embedded in JavaScript
        json_data = extract_json_from_scripts(html)

        if json_data:
            log(f"\n✓ 提取到 {len(json_data)} 个数据片段")

            # 保存提取的数据
            data_file = os.path.join(OUTPUT_DIR, f"extracted_data_{timestamp}.json")
            with open(data_file, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, ensure_ascii=False, indent=2)
            log(f"✓ 数据已保存: {data_file}")

            # Show a summary of the first few fragments
            log("\nData summary:")
            for item in json_data[:5]:
                if 'data' in item:
                    log(f"  - Found structured data (Script {item['script_index']})")
                    # Try to show the key fields
                    data = item['data']
                    if isinstance(data, dict):
                        keys = list(data.keys())[:10]
                        log(f"    Keys: {keys}")
                elif 'value' in item:
                    log(f"  - Value: {item['value']}")
                elif 'raw' in item:
                    log(f"  - Unparsed fragment (Script {item['script_index']})")

        # Analyze the contents of all script tags
        log("\nAnalyzing all script tags...")
        scripts = soup.find_all('script')
        script_stats = []

        for idx, script in enumerate(scripts):
            if script.get('src'):
                script_stats.append({
                    'index': idx,
                    'type': 'external',
                    'src': script.get('src')
                })
            elif script.string:
                size = len(script.string)
                script_stats.append({
                    'index': idx,
                    'type': 'inline',
                    'size': size,
                    'preview': script.string[:100]
                })

        log(f"  找到 {len(script_stats)} 个script标签")
        log(f"  外部脚本: {len([s for s in script_stats if s['type'] == 'external'])}")
        log(f"  内联脚本: {len([s for s in script_stats if s['type'] == 'inline'])}")

        # Save the script statistics
        script_file = os.path.join(OUTPUT_DIR, f"scripts_info_{timestamp}.json")
        with open(script_file, 'w', encoding='utf-8') as f:
            json.dump(script_stats, f, ensure_ascii=False, indent=2)
        log(f"✓ Script info saved: {script_file}")

        # Generate a summary report
        report = {
            'timestamp': timestamp,
            'url': url,
            'status_code': response.status_code,
            'html_size': len(html),
            'scripts_count': len(scripts),
            'json_fragments': len(json_data),
            'search_keyword': decode_q.group(1) if decode_q else None
        }

        report_file = os.path.join(OUTPUT_DIR, f"report_{timestamp}.json")
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        log(f"✓ 报告已保存: {report_file}")

    except Exception as e:
        log(f"✗ Error: {e}", "ERROR")
        log(traceback.format_exc(), "ERROR")

    log("\n" + "="*70)
    log(f"完成! 所有文件保存在: {OUTPUT_DIR}/")
    log("="*70)

    # Suggest next steps
    log("\n💡 Suggested next steps:")
    log("1. Check extracted_data_*.json for the extracted data")
    log("2. Taobao renders with React/Vue, so the real data is likely in:")
    log("   - API responses (requires traffic capture and analysis)")
    log("   - dynamically loaded JS files")
    log("3. Use the browser devtools Network tab to find the data API")

def main():
    url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=13&ie=utf8&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&q=%E7%94%B5%E5%99%A8%E5%AE%B6%E7%94%A8%E5%A4%A7%E5%85%A8&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

    fetch_and_analyze(url)

if __name__ == "__main__":
    main()
