#!/usr/bin/env python3
"""
Taobao API batch scraper
Uses the captured API request structure to batch-request data for 103 products
"""

import requests
import json
import time
import random
import os
import logging
from datetime import datetime
from urllib.parse import quote, unquote
import re

# Configuration
OUTPUT_DIR = "batch_api_results"
LOG_DIR = "batch_api_logs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# Logging setup
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = os.path.join(LOG_DIR, f"batch_scraping_{timestamp}.log")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

# User-Agent pool
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]

class TaobaoAPIBatchScraper:
    def __init__(self):
        self.session = requests.Session()
        self.results = []
        self.failed_keywords = []
        self.captured_apis = []
        self.load_captured_apis()
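
    # A minimal sketch of the captured-API JSON that load_captured_apis() expects.
    # The exact capture format is an assumption; the scraper only relies on the
    # 'url', 'mimeType', and 'status' fields read below. The URL shown is illustrative:
    #
    #   [
    #     {
    #       "url": "https://example-taobao-api/...?q=%E7%89%99%E5%88%B7&...",
    #       "mimeType": "application/json",
    #       "status": 200
    #     },
    #     ...
    #   ]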

    def load_captured_apis(self):
        """加载捕获的API请求"""
        api_file = "api_captured/api_requests_牙刷_20251010_135245.json"

        if os.path.exists(api_file):
            with open(api_file, 'r', encoding='utf-8') as f:
                self.captured_apis = json.load(f)
            logger.info(f"✓ 加载了 {len(self.captured_apis)} 个捕获的API请求")
        else:
            logger.warning(f"⚠ API文件不存在: {api_file}")

    def get_random_headers(self):
        """获取随机请求头"""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Referer': 'https://s.taobao.com/',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
        }

    def parse_jsonp(self, text):
        """解析JSONP响应"""
        try:
            # 尝试直接解析JSON
            return json.loads(text)
        except:
            # 处理JSONP格式: mtopjsonp1({"data": ...})
            match = re.search(r'[a-zA-Z0-9_]+\((.*)\)', text, re.DOTALL)
            if match:
                json_str = match.group(1)
                return json.loads(json_str)
            return None
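
    # Illustrative only (not a real captured response): an mtop-style JSONP body
    # and the dict parse_jsonp() returns for it:
    #
    #   mtopjsonp1({"api": "...", "data": {"itemsArray": [{"title": "...", "price": "9.90"}]}})
    #   -> {"api": "...", "data": {"itemsArray": [{"title": "...", "price": "9.90"}]}}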

    def request_with_captured_url(self, url, keyword):
        """使用捕获的URL请求(替换关键词)"""
        try:
            # Substitute the keyword into the URL.
            # Taobao search URLs carry the keyword URL-encoded, e.g. q=%E7%89%99%E5%88%B7
            encoded_keyword = quote(keyword)

            # Simple substitution strategy (may not fit every API)
            if '%E7%89%99%E5%88%B7' in url:  # URL-encoded form of the original keyword "牙刷"
                url = url.replace('%E7%89%99%E5%88%B7', encoded_keyword)
            elif 'q=' in url:
                url = re.sub(r'q=[^&]*', f'q={encoded_keyword}', url)

            headers = self.get_random_headers()

            logger.info(f"  请求URL: {url[:100]}...")

            response = self.session.get(url, headers=headers, timeout=15)

            logger.info(f"  状态码: {response.status_code}")

            if response.status_code == 200:
                # Try to parse the response body
                content_type = response.headers.get('Content-Type', '')

                if 'json' in content_type or 'javascript' in content_type:
                    data = self.parse_jsonp(response.text)
                    if data:
                        logger.info(f"  ✓ 成功解析数据")
                        return {'success': True, 'data': data, 'url': url}
                    else:
                        logger.warning(f"  ⚠ 无法解析JSON/JSONP")
                        return {'success': False, 'error': 'Parse error', 'text': response.text[:200]}
                else:
                    logger.warning(f"  ⚠ 非JSON响应: {content_type}")
                    return {'success': False, 'error': f'Wrong content type: {content_type}'}
            else:
                logger.error(f"  ✗ 请求失败: {response.status_code}")
                return {'success': False, 'error': f'HTTP {response.status_code}'}

        except Exception as e:
            logger.error(f"  ✗ 异常: {e}")
            return {'success': False, 'error': str(e)}
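
    # Return contract of request_with_captured_url (mirrors the branches above):
    #   success: {'success': True,  'data': <parsed dict>, 'url': <rewritten URL>}
    #   failure: {'success': False, 'error': <message>}  # parse failures also include 'text'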

    def try_multiple_api_endpoints(self, keyword):
        """尝试多个API端点"""
        logger.info(f"\n处理关键词: {keyword}")

        # Keep only endpoints that returned successful JSON responses
        json_apis = [
            api for api in self.captured_apis
            if 'json' in api.get('mimeType', '').lower()
            and api.get('status') == 200
        ]

        # Prefer APIs whose URLs suggest they carry product data
        priority_patterns = [
            'search',   # search APIs
            'item',     # item APIs
            'product',  # product APIs
            'goods',    # goods APIs
        ]

        # Order the endpoints by priority
        sorted_apis = []
        for pattern in priority_patterns:
            for api in json_apis:
                if pattern in api['url'].lower() and api not in sorted_apis:
                    sorted_apis.append(api)

        # Append the remaining APIs
        for api in json_apis:
            if api not in sorted_apis:
                sorted_apis.append(api)

        logger.info(f"  尝试 {len(sorted_apis)} 个API端点")

        # Try each API in turn
        for idx, api in enumerate(sorted_apis[:5], 1):  # try at most 5
            logger.info(f"  [{idx}] Trying API: {api['url'][:80]}...")

            result = self.request_with_captured_url(api['url'], keyword)

            if result.get('success'):
                # Check whether the response contains product data
                data = result.get('data', {})
                if self.has_product_data(data):
                    logger.info(f"  ✓ 找到商品数据!")
                    return result
                else:
                    logger.info(f"  ⚠ 响应成功但不包含商品数据")

            # Delay between attempts to avoid rapid-fire requests
            time.sleep(random.uniform(1, 2))

        logger.warning(f"  ✗ 所有API端点均未获取到商品数据")
        return {'success': False, 'error': 'No product data found'}

    def has_product_data(self, data):
        """检查响应是否包含商品数据"""
        if not isinstance(data, dict):
            return False

        # Field names that commonly hold product lists
        product_indicators = [
            'items', 'itemsArray', 'auctions', 'mainItems',
            'itemList', 'productList', 'goods', 'products'
        ]

        def check_nested(obj, depth=0):
            if depth > 5:  # inspect at most 5 nesting levels
                return False

            if isinstance(obj, dict):
                for key in obj.keys():
                    if any(indicator in key.lower() for indicator in product_indicators):
                        value = obj[key]
                        if isinstance(value, list) and len(value) > 0:
                            logger.info(f"    发现商品字段: {key} (包含 {len(value)} 项)")
                            return True

                # Recurse into nested values
                for value in obj.values():
                    if check_nested(value, depth + 1):
                        return True

            elif isinstance(obj, list) and len(obj) > 0:
                return check_nested(obj[0], depth + 1)

            return False

        return check_nested(data)

    def extract_products(self, data):
        """从响应中提取商品数据"""
        products = []

        def find_product_list(obj, depth=0):
            if depth > 5:
                return []

            if isinstance(obj, dict):
                # Check common list-holding field names
                for key in ['items', 'itemsArray', 'auctions', 'mainItems', 'itemList']:
                    if key in obj and isinstance(obj[key], list):
                        return obj[key]

                # Recurse into nested values
                for value in obj.values():
                    result = find_product_list(value, depth + 1)
                    if result:
                        return result

            elif isinstance(obj, list) and len(obj) > 0:
                # Heuristic: does this look like a list of product dicts?
                if isinstance(obj[0], dict) and any(k in obj[0] for k in ['title', 'item_id', 'nid', 'raw_title']):
                    return obj

            return []

        product_list = find_product_list(data)

        if product_list:
            logger.info(f"    提取到 {len(product_list)} 个商品")

            for item in product_list:
                try:
                    product = {
                        'title': item.get('title') or item.get('raw_title') or item.get('item_title', ''),
                        'price': item.get('price') or item.get('view_price') or item.get('reserve_price', ''),
                        'sales': item.get('view_sales') or item.get('sold', ''),
                        'item_id': item.get('item_id') or item.get('nid') or item.get('id', ''),
                        'shop_name': item.get('nick') or item.get('shop_name', ''),
                        'location': item.get('item_loc') or item.get('location', ''),
                        'pic_url': item.get('pic_url') or item.get('pict_url', ''),
                    }

                    # Keep only products that have a title
                    if product['title']:
                        products.append(product)

                except Exception as e:
                    logger.warning(f"    提取商品失败: {e}")

        return products
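
    # Shape of one extracted product record (values are illustrative; which source
    # keys exist depends on the particular Taobao API that answered):
    #
    #   {"title": "...", "price": "19.90", "sales": "...", "item_id": "...",
    #    "shop_name": "...", "location": "...", "pic_url": "..."}
    #
    # save_final_results() later attaches the originating 'search_keyword' to each record.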

    def scrape_keyword(self, keyword):
        """爬取单个关键词"""
        try:
            result = self.try_multiple_api_endpoints(keyword)

            if result.get('success'):
                # Extract products from the successful response
                products = self.extract_products(result.get('data', {}))

                if products:
                    keyword_result = {
                        'keyword': keyword,
                        'success': True,
                        'product_count': len(products),
                        'products': products,
                        'api_url': result.get('url', ''),
                        'timestamp': datetime.now().isoformat()
                    }

                    logger.info(f"✓ 成功: {keyword} - 提取 {len(products)} 个商品")
                    return keyword_result
                else:
                    logger.warning(f"⚠ {keyword} - API成功但未提取到商品")

            # Failure path: record the keyword and error
            self.failed_keywords.append({
                'keyword': keyword,
                'error': result.get('error', 'Unknown'),
                'timestamp': datetime.now().isoformat()
            })

            logger.error(f"✗ 失败: {keyword} - {result.get('error')}")
            return None

        except Exception as e:
            logger.error(f"✗ 异常: {keyword} - {e}")
            self.failed_keywords.append({
                'keyword': keyword,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            })
            return None

    def batch_scrape(self, keywords_file='daily_goods_100.txt'):
        """批量爬取"""
        logger.info("="*70)
        logger.info("淘宝API批量爬取器")
        logger.info("="*70)

        # Load the keyword list
        if not os.path.exists(keywords_file):
            logger.error(f"关键词文件不存在: {keywords_file}")
            return

        with open(keywords_file, 'r', encoding='utf-8') as f:
            keywords = [line.strip() for line in f if line.strip()]

        if not keywords:
            logger.error(f"Keyword file is empty: {keywords_file}")
            return

        logger.info(f"\nLoaded {len(keywords)} keywords")
        logger.info(f"Log file: {log_file}\n")

        start_time = time.time()

        # Scrape keywords one at a time
        for idx, keyword in enumerate(keywords, 1):
            logger.info(f"\n[{idx}/{len(keywords)}] 处理: {keyword}")

            result = self.scrape_keyword(keyword)

            if result:
                self.results.append(result)

            # Random delay between keywords to reduce the risk of being blocked
            delay = random.uniform(2, 5)
            logger.info(f"  Sleeping for {delay:.1f}s...")
            time.sleep(delay)

            # Save progress every 10 keywords
            if idx % 10 == 0:
                self.save_progress()

        # Final save
        self.save_final_results()

        # Summary statistics
        elapsed = time.time() - start_time
        logger.info("\n" + "="*70)
        logger.info("Scraping finished!")
        logger.info("="*70)
        logger.info(f"Total keywords: {len(keywords)}")
        logger.info(f"Successful: {len(self.results)}")
        logger.info(f"Failed: {len(self.failed_keywords)}")
        logger.info(f"Total time: {elapsed/60:.1f} minutes")
        logger.info(f"Average: {elapsed/len(keywords):.1f}s per keyword")
        logger.info("="*70)

    def save_progress(self):
        """保存中间进度"""
        try:
            progress_file = os.path.join(OUTPUT_DIR, f"progress_{timestamp}.json")
            with open(progress_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'results': self.results,
                    'failed': self.failed_keywords,
                    'saved_at': datetime.now().isoformat()
                }, f, ensure_ascii=False, indent=2)
            logger.info(f"  ✓ 进度已保存: {progress_file}")
        except Exception as e:
            logger.error(f"  ✗ 保存进度失败: {e}")

    def save_final_results(self):
        """保存最终结果"""
        try:
            # Save the complete result set
            result_file = os.path.join(OUTPUT_DIR, f"batch_results_{timestamp}.json")
            with open(result_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'summary': {
                        'total_keywords': len(self.results) + len(self.failed_keywords),
                        'successful': len(self.results),
                        'failed': len(self.failed_keywords),
                        'total_products': sum(r['product_count'] for r in self.results),
                        'timestamp': datetime.now().isoformat()
                    },
                    'results': self.results,
                    'failed_keywords': self.failed_keywords
                }, f, ensure_ascii=False, indent=2)

            logger.info(f"\n✓ 完整结果已保存: {result_file}")

            # Save a flattened list of all products
            all_products = []
            for result in self.results:
                for product in result['products']:
                    product['search_keyword'] = result['keyword']
                    all_products.append(product)

            if all_products:
                products_file = os.path.join(OUTPUT_DIR, f"all_products_{timestamp}.json")
                with open(products_file, 'w', encoding='utf-8') as f:
                    json.dump(all_products, f, ensure_ascii=False, indent=2)

                logger.info(f"✓ 商品列表已保存: {products_file}")
                logger.info(f"  总商品数: {len(all_products)}")

            # Save the list of failed keywords
            if self.failed_keywords:
                failed_file = os.path.join(OUTPUT_DIR, f"failed_keywords_{timestamp}.json")
                with open(failed_file, 'w', encoding='utf-8') as f:
                    json.dump(self.failed_keywords, f, ensure_ascii=False, indent=2)

                logger.info(f"✓ 失败列表已保存: {failed_file}")

        except Exception as e:
            logger.error(f"✗ 保存最终结果失败: {e}")

def main():
    scraper = TaobaoAPIBatchScraper()
    scraper.batch_scrape()

if __name__ == "__main__":
    main()
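
# Minimal usage sketch. Assumes the capture file referenced in load_captured_apis()
# and a one-keyword-per-line text file exist; 'household_goods.txt' is a hypothetical name:
#
#   scraper = TaobaoAPIBatchScraper()
#   scraper.batch_scrape('household_goods.txt')  # defaults to 'daily_goods_100.txt'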
