#!/usr/bin/env python3
"""
淘宝爬虫 - 简化调试模式
不依赖Selenium，使用requests直接获取页面
"""

import json
import os
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# Create the output directory
OUTPUT_DIR = "debug_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log(message, level="INFO"):
    """打印带时间戳的日志"""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

def fetch_page(url):
    """获取页面内容"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    log(f"正在请求URL: {url[:80]}...")

    try:
        response = requests.get(url, headers=headers, timeout=30)
        # requests falls back to ISO-8859-1 when the response header omits a charset,
        # which garbles Chinese pages; prefer the detected encoding instead.
        response.encoding = response.apparent_encoding
        log(f"✓ Response status code: {response.status_code}")
        log(f"✓ Response size: {len(response.content)} bytes")
        log(f"✓ Content type: {response.headers.get('Content-Type', 'Unknown')}")

        return response.text, response.status_code
    except requests.RequestException as e:
        log(f"✗ Request failed: {e}", "ERROR")
        return None, None
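
# Optional sketch (not called above): Taobao search pages usually sit behind
# login / anti-bot checks, so a plain GET often comes back as a login or captcha
# page. This variant shows how cookies copied from a logged-in browser session
# could be attached via a requests.Session. The cookie dict and its key names
# are assumptions, not something this script can obtain by itself.
def fetch_page_with_cookies(url, cookies=None):
    """Fetch the page through a requests.Session, optionally with browser cookies."""
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    })
    if cookies:
        # e.g. a dict exported from the browser's devtools (hypothetical values)
        session.cookies.update(cookies)
    try:
        response = session.get(url, timeout=30)
        return response.text, response.status_code
    except requests.RequestException as e:
        log(f"✗ Request failed: {e}", "ERROR")
        return None, None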

def analyze_html(html):
    """分析HTML结构"""
    log("开始分析HTML结构...")

    # Save the raw HTML
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    html_file = os.path.join(OUTPUT_DIR, f"page_{timestamp}.html")
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(html)
    log(f"✓ 已保存HTML: {html_file}")

    soup = BeautifulSoup(html, 'html.parser')

    # Basic statistics
    stats = {
        'title': soup.title.get_text(strip=True) if soup.title else 'N/A',
        'div_count': len(soup.find_all('div')),
        'a_count': len(soup.find_all('a')),
        'script_count': len(soup.find_all('script')),
        'img_count': len(soup.find_all('img')),
    }

    log("页面基本信息:")
    log(f"  标题: {stats['title']}")
    log(f"  <div>数量: {stats['div_count']}")
    log(f"  <a>链接数量: {stats['a_count']}")
    log(f"  <script>数量: {stats['script_count']}")
    log(f"  <img>图片数量: {stats['img_count']}")

    # Collect all CSS class names
    all_classes = set()
    for tag in soup.find_all(class_=True):
        if isinstance(tag.get('class'), list):
            all_classes.update(tag.get('class'))

    log(f"\n找到 {len(all_classes)} 个不同的CSS类")

    # Look for classes that might relate to shops/products
    keywords = ['shop', 'item', 'card', 'product', 'goods', 'store', 'list']
    relevant_classes = []

    for cls in all_classes:
        if any(keyword in cls.lower() for keyword in keywords):
            relevant_classes.append(cls)

    log(f"\n找到 {len(relevant_classes)} 个可能相关的CSS类:")
    for cls in sorted(relevant_classes)[:30]:
        log(f"  - {cls}")

    # Look for likely data containers
    log("\nLooking for likely data containers...")
    containers = []

    for cls in relevant_classes[:10]:
        elements = soup.find_all(class_=cls)
        if elements:
            log(f"  类 '{cls}': {len(elements)} 个元素")
            containers.append({'class': cls, 'count': len(elements)})

    # Check for JSON data embedded in the page
    log("\nChecking for JSON data in the page...")
    scripts = soup.find_all('script')
    json_data_found = []

    markers = ('g_page_config', 'window.g_config', '"data"')
    for idx, script in enumerate(scripts):
        if script.string and any(marker in script.string for marker in markers):
            snippet = script.string[:80]
            log(f"  Script {idx}: may contain data - {snippet}...")
            json_data_found.append(idx)

            # Save scripts that might contain JSON (first three only)
            if len(json_data_found) <= 3:
                script_file = os.path.join(OUTPUT_DIR, f"script_{timestamp}_{idx}.js")
                with open(script_file, 'w', encoding='utf-8') as f:
                    f.write(script.string)
                log(f"    Saved to: {script_file}")

    return {
        'stats': stats,
        'relevant_classes': relevant_classes,
        'containers': containers,
        'json_scripts': json_data_found,
        'timestamp': timestamp
    }
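
# Optional sketch (not called above): when analyze_html() flags a script that
# mentions 'g_page_config', the embedded object can sometimes be pulled out with
# a regex plus json.loads. The assignment pattern and the shape of the resulting
# dict are assumptions; Taobao changes this payload frequently, so inspect the
# saved script_*.js files first and adjust the pattern to match.
def try_parse_embedded_json(html, var_name='g_page_config'):
    """Best-effort extraction of an embedded `var_name = {...};` JSON object."""
    import re  # local import so the sketch stays self-contained
    match = re.search(rf'{re.escape(var_name)}\s*=\s*(\{{.*?\}});', html, re.S)
    if not match:
        log(f"No '{var_name}' assignment found in the page")
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        log(f"Found '{var_name}' but could not parse it as JSON: {e}", "WARN")
        return None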

def extract_data(html):
    """尝试提取结构化数据"""
    log("\n尝试提取结构化数据...")
    soup = BeautifulSoup(html, 'html.parser')

    extracted = []

    # Strategy 1: find divs whose class suggests an item/shop card
    log("Strategy 1: looking for containers with links...")
    divs_with_links = soup.find_all('div', class_=lambda x: x and ('item' in str(x).lower() or 'shop' in str(x).lower()))

    for div in divs_with_links[:20]:
        links = div.find_all('a', href=True)
        if links:
            item = {
                'class': ' '.join(div.get('class', [])),
                'text': div.get_text(strip=True)[:100],
                'links': [link.get('href') for link in links[:3]],
                'html_snippet': str(div)[:200]
            }
            extracted.append(item)

    log(f"  找到 {len(extracted)} 个可能的项目")

    # Strategy 2: fall back to scanning all links directly
    if len(extracted) < 5:
        log("Strategy 2: analyzing all links...")
        all_links = soup.find_all('a', href=True)
        shop_links = [link for link in all_links if 'shop' in link.get('href', '').lower() or 'store' in link.get('href', '').lower()]

        log(f"  找到 {len(shop_links)} 个可能的店铺链接")

        for link in shop_links[:10]:
            item = {
                'text': link.get_text(strip=True),
                'url': link.get('href'),
                'parent_class': ' '.join(link.parent.get('class', [])) if link.parent else ''
            }
            extracted.append(item)

    return extracted
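
# Optional helper (not called above): hrefs scraped from Taobao pages are often
# protocol-relative ('//shop...') or page-relative, so normalizing them against
# the search URL makes the saved JSON easier to reuse. This is a small sketch
# built on urllib.parse.urljoin; pass it the same URL given to fetch_page().
def normalize_links(items, base_url):
    """Return a copy of the extracted items with absolute 'url'/'links' values."""
    from urllib.parse import urljoin  # local import so the sketch stays self-contained
    normalized = []
    for item in items:
        fixed = dict(item)
        if fixed.get('url'):
            fixed['url'] = urljoin(base_url, fixed['url'])
        if fixed.get('links'):
            fixed['links'] = [urljoin(base_url, href) for href in fixed['links']]
        normalized.append(fixed)
    return normalized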

def main():
    log("="*70)
    log("淘宝爬虫 - 简化调试模式")
    log("="*70)

    url = "https://s.taobao.com/search?_input_charset=utf-8&commend=all&finalPage=13&ie=utf8&ie=utf8&initiative_id=tbindexz_20170306&preLoadOrigin=https%3A%2F%2Fwww.taobao.com&q=%E7%94%B5%E5%99%A8%E5%AE%B6%E7%94%A8%E5%A4%A7%E5%85%A8&search_type=shop&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.search_downSideRecommend.d1&ssid=s5-e&suggest=0_1&suggest_query=%E7%94%B5%E5%99%A8&tab=shop&wq=%E7%94%B5%E5%99%A8"

    # Fetch the page
    html, status_code = fetch_page(url)

    if html:
        # Analyze the page
        analysis = analyze_html(html)

        # Extract data
        data = extract_data(html)

        # Save the extracted data
        if data:
            data_file = os.path.join(OUTPUT_DIR, f"extracted_data_{analysis['timestamp']}.json")
            with open(data_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            log(f"\n✓ 提取的数据已保存: {data_file}")

            log("\n提取的数据示例:")
            for i, item in enumerate(data[:5], 1):
                log(f"\n  [{i}] {item.get('text', 'N/A')[:60]}")
                if 'url' in item:
                    log(f"      URL: {item['url'][:60]}")

        # Save the analysis report
        report_file = os.path.join(OUTPUT_DIR, f"analysis_report_{analysis['timestamp']}.json")
        report = {
            'url': url,
            'status_code': status_code,
            'analysis': analysis,
            'extracted_count': len(data),
            'sample_data': data[:3]
        }
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        log(f"\n✓ 分析报告已保存: {report_file}")

    log("\n" + "="*70)
    log(f"调试完成! 所有文件保存在: {OUTPUT_DIR}/")
    log("="*70)

    # List the generated files
    log("\nGenerated files:")
    for file in sorted(os.listdir(OUTPUT_DIR)):
        filepath = os.path.join(OUTPUT_DIR, file)
        size = os.path.getsize(filepath)
        log(f"  - {file} ({size:,} bytes)")

if __name__ == "__main__":
    main()
