#!/usr/bin/env python3
import json
import re
from collections import Counter

def is_chinese_text(text):
    """Return True if *text* contains at least one Chinese character.

    Only the CJK Unified Ideographs range U+4E00..U+9FFF is checked, so
    strings made up solely of kana, Latin letters, digits or punctuation
    are reported as non-Chinese.

    Args:
        text: Value to inspect. Empty/None values and values that are not
            ``str`` (e.g. a number or list coming from JSON metadata) are
            treated as "not Chinese" instead of raising.

    Returns:
        bool: True when a character in the range is found, else False.
    """
    # JSON metadata fields are not guaranteed to be strings; re.search()
    # raises TypeError for non-str input, which would abort the caller's
    # whole run over a single odd record.
    if not isinstance(text, str) or not text:
        return False
    return re.search(r'[\u4e00-\u9fff]', text) is not None

def process_partial_recovery(
        books_path='/root/book-manager/data/chinese_books.json',
        recovered_path='/root/annas_archive_zlib3_records_recovered.jsonl'):
    """Merge Chinese books from a recovered JSONL dump into the book list.

    Each line of *recovered_path* is parsed as JSON and its ``metadata``
    object is kept when ``language == 'chinese'`` or when the title or
    author contains Chinese characters (see ``is_chinese_text``). Records
    whose ``zlibrary_id`` is already present in *books_path*, or was already
    accepted earlier in the same run, are skipped. If any new book is found,
    *books_path* is rewritten with the existing books followed by the new
    ones, and a short format/language summary is printed.

    Args:
        books_path: JSON file holding the list of already-known Chinese
            books; it is read first and rewritten when new books are found.
        recovered_path: JSONL file with one record per line.

    Returns:
        None. Results are reported on stdout and written to *books_path*.

    Notes:
        * A missing *books_path* means "start from an empty list". Any other
          failure to read it (unreadable file, invalid JSON, top-level value
          that is not a list) aborts WITHOUT writing, so a damaged data file
          is never overwritten with only the new records.
        * Lines that are not valid JSON, are not JSON objects, or whose
          ``metadata`` is not an object are skipped.
        * Records without a ``zlibrary_id`` cannot be de-duplicated and are
          always treated as new.
    """
    print("🔄 处理 partial recovery 数据...")

    # Load the existing book list.
    try:
        with open(books_path, 'r', encoding='utf-8') as f:
            existing_books = json.load(f)
    except FileNotFoundError:
        existing_books = []
        print("未找到现有数据，将创建新数据")
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Continuing here would later overwrite the file and lose its data.
        print(f"❌ 无法读取现有数据，已中止以免覆盖: {exc}")
        return
    else:
        if not isinstance(existing_books, list):
            print("❌ 无法读取现有数据，已中止以免覆盖: 顶层结构不是列表")
            return
        print(f"现有中文书籍: {len(existing_books)} 本")

    # Ids already known; extended below so duplicates inside the recovered
    # file itself are dropped as well.
    seen_ids = {
        book['zlibrary_id']
        for book in existing_books
        if isinstance(book, dict) and book.get('zlibrary_id') is not None
    }

    # Scan the recovered records.
    new_chinese_books = []
    total_processed = 0

    with open(recovered_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                continue

            # A line can be valid JSON without being an object, and
            # "metadata" can be null; neither can be inspected further.
            metadata = data.get('metadata', {}) if isinstance(data, dict) else None
            if not isinstance(metadata, dict):
                continue

            zlib_id = metadata.get('zlibrary_id')
            if zlib_id is not None and zlib_id in seen_ids:
                continue

            if (metadata.get('language', '') == 'chinese'
                    or is_chinese_text(metadata.get('title', ''))
                    or is_chinese_text(metadata.get('author', ''))):
                new_chinese_books.append(metadata)
                if zlib_id is not None:
                    seen_ids.add(zlib_id)

            # Counts records that were examined, i.e. not skipped above.
            total_processed += 1

    print(f"处理记录: {total_processed} 条")
    print(f"新增中文书籍: {len(new_chinese_books)} 本")

    if not new_chinese_books:
        print("❌ 没有找到新的中文书籍")
        return

    # Merge and save.
    all_books = existing_books + new_chinese_books
    with open(books_path, 'w', encoding='utf-8') as f:
        json.dump(all_books, f, ensure_ascii=False, indent=2)

    print(f"✅ 数据已更新，总计: {len(all_books)} 本中文书籍")

    # Summary of the newly added books. Missing, null or empty values are
    # reported as '未知'; str() keeps .upper() and Counter safe for values
    # that are not strings.
    print("\n📊 新增书籍统计:")

    formats = Counter(
        str(book.get('extension') or '未知') for book in new_chinese_books
    )
    print("文件格式:")
    for fmt, count in formats.most_common(5):
        print(f"  {fmt.upper()}: {count} 本")

    languages = Counter(
        str(book.get('language') or '未知') for book in new_chinese_books
    )
    print("语言分布:")
    for lang, count in languages.most_common():
        print(f"  {lang}: {count} 本")

# Script entry point: run the merge only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_partial_recovery()