#!/usr/bin/env python3
"""
API逆向工程工具
从已保存的HTML和网络日志中提取API结构
"""

import json
import re
import os
from urllib.parse import urlparse, parse_qs
from datetime import datetime

OUTPUT_DIR = "api_reverse"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log(message, level="INFO"):
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")

def extract_api_from_html(html_file):
    """从HTML中提取嵌入的API数据"""
    log(f"分析HTML文件: {html_file}")

    with open(html_file, 'r', encoding='utf-8') as f:
        html = f.read()

    apis_found = []

    # 策略1: 查找script标签中的JSON数据
    script_pattern = r'<script[^>]*>(.*?)</script>'
    scripts = re.findall(script_pattern, html, re.DOTALL)

    log(f"  找到 {len(scripts)} 个script标签")

    for idx, script in enumerate(scripts):
        # 查找g_page_config, window.__INITIAL_STATE__等
        patterns = [
            r'g_page_config\s*=\s*({.*?});',
            r'window\.__INITIAL_STATE__\s*=\s*({.*?});',
            r'window\.g_config\s*=\s*({.*?});',
            r'"itemsArray"\s*:\s*(\[.*?\])',
            r'"data"\s*:\s*({.*?})',
        ]

        for pattern in patterns:
            matches = re.findall(pattern, script, re.DOTALL)
            if matches:
                for match in matches:
                    try:
                        data = json.loads(match)
                        apis_found.append({
                            'type': 'embedded_json',
                            'pattern': pattern[:30],
                            'data': data
                        })
                        log(f"    ✓ 找到嵌入的JSON数据")
                    except:
                        pass

    # 策略2: 查找URL模式
    url_pattern = r'https?://[^\s"\'<>]+(?:api|search|h5api|mtop)[^\s"\'<>]*'
    urls = re.findall(url_pattern, html)

    unique_urls = list(set(urls))
    log(f"  找到 {len(unique_urls)} 个可能的API URL")

    for url in unique_urls[:10]:
        apis_found.append({
            'type': 'url_pattern',
            'url': url
        })

    return apis_found

def analyze_api_url(url):
    """分析API URL结构"""
    parsed = urlparse(url)
    params = parse_qs(parsed.query)

    analysis = {
        'scheme': parsed.scheme,
        'domain': parsed.netloc,
        'path': parsed.path,
        'params': params,
        'base_url': f"{parsed.scheme}://{parsed.netloc}{parsed.path}",
        'query_string': parsed.query
    }

    return analysis

def reverse_engineer_apis():
    """逆向工程所有保存的数据"""
    log("="*70)
    log("API逆向工程")
    log("="*70)

    # 查找所有HTML文件
    html_dirs = [
        'taobao_batch_results',
        'taobao_smart_results',
        'taobao_selenium_output',
        'debug_output'
    ]

    all_apis = []

    for dir_name in html_dirs:
        if not os.path.exists(dir_name):
            continue

        log(f"\n检查目录: {dir_name}")
        html_files = [f for f in os.listdir(dir_name) if f.endswith('.html')]

        log(f"  找到 {len(html_files)} 个HTML文件")

        for html_file in html_files[:3]:  # 限制分析前3个
            filepath = os.path.join(dir_name, html_file)
            apis = extract_api_from_html(filepath)
            all_apis.extend(apis)

    # 保存发现的API
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    result_file = os.path.join(OUTPUT_DIR, f"discovered_apis_{timestamp}.json")

    with open(result_file, 'w', encoding='utf-8') as f:
        json.dump(all_apis, f, ensure_ascii=False, indent=2)

    log(f"\n✓ 发现 {len(all_apis)} 个API相关数据")
    log(f"✓ 保存到: {result_file}")

    # 分析URL结构
    url_apis = [api for api in all_apis if api['type'] == 'url_pattern']

    if url_apis:
        log(f"\nURL模式分析:")
        for idx, api in enumerate(url_apis[:10], 1):
            analysis = analyze_api_url(api['url'])
            log(f"\n  [{idx}] {analysis['base_url']}")
            log(f"      域名: {analysis['domain']}")
            log(f"      路径: {analysis['path']}")
            if analysis['params']:
                log(f"      参数: {list(analysis['params'].keys())[:5]}")

    return all_apis

def create_api_request_template():
    """创建API请求模板"""
    log("\n" + "="*70)
    log("生成API请求模板")
    log("="*70)

    template = """
# 淘宝API请求模板

## 基于分析的可能API端点

### 方案1: 搜索API（推测）
```python
import requests

url = "https://s.taobao.com/api/search/itemsearch"  # 需要验证

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://s.taobao.com/',
    'Accept': 'application/json',
}

params = {
    'q': '牙刷',           # 搜索关键词
    's': 0,               # 起始位置
    'sort': 'default',    # 排序方式
}

# 可能需要Cookie
cookies = {
    '_tb_token_': 'xxx',  # 从浏览器获取
}

response = requests.get(url, headers=headers, params=params, cookies=cookies)
data = response.json()
```

### 方案2: MTOP API（淘宝常用）
```python
# 淘宝MTOP API格式
url = "https://h5api.m.taobao.com/h5/mtop.taobao.xxx"

headers = {
    'User-Agent': 'Mozilla/5.0...',
}

# MTOP需要签名
params = {
    'appKey': 'xxxx',
    'data': '{"keyword":"牙刷"}',
    't': timestamp,
    'sign': sign_value,  # 需要算法生成
}
```

### 方案3: 移动端H5 API
```python
url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend"

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 10; ...)',
}
```

## 下一步

1. **使用api_interceptor.py捕获真实API**
   ```bash
   python api_interceptor.py 牙刷
   ```

2. **分析捕获的请求**
   - 查看api_captured/目录
   - 找到返回JSON的API
   - 复制完整的headers和params

3. **测试API请求**
   ```python
   # 使用捕获的真实URL测试
   response = requests.get(captured_url, headers=captured_headers)
   print(response.json())
   ```
"""

    template_file = os.path.join(OUTPUT_DIR, "api_request_template.md")
    with open(template_file, 'w', encoding='utf-8') as f:
        f.write(template)

    log(f"✓ 模板已保存: {template_file}")

def main():
    reverse_engineer_apis()
    create_api_request_template()

    log("\n提示: 运行 python api_interceptor.py 来捕获真实API请求")

if __name__ == "__main__":
    main()
