#!/usr/bin/env python3
import json
import re
from collections import Counter

# Compiled once at import time. Matches a single code point in the range
# U+4E00-U+9FFF (the CJK Unified Ideographs block).
_CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fff]')


def is_chinese_text(text):
    """Return True if *text* contains at least one Chinese character.

    A "Chinese character" here is any code point in U+4E00-U+9FFF.

    Args:
        text: The value to inspect. Anything that is not a non-empty ``str``
            (``None``, numbers, lists, bytes, ...) is treated as containing
            no Chinese text instead of raising.

    Returns:
        bool: True when a matching character is found, otherwise False.
    """
    # JSON fields are not guaranteed to be strings; guard before regex search
    # so a stray number or list does not raise TypeError.
    if not isinstance(text, str) or not text:
        return False
    return _CHINESE_CHAR_PATTERN.search(text) is not None

def analyze_chinese_books(file_path):
    chinese_books = []
    total_count = 0
    
    print("正在分析数据...")
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if line_num % 100000 == 0:
                print(f"已处理 {line_num} 条记录...")
            
            try:
                data = json.loads(line.strip())
                metadata = data.get('metadata', {})
                
                title = metadata.get('title', '')
                author = metadata.get('author', '')
                language = metadata.get('language', '')
                
                # 检查是否为中文书籍
                if (language == 'chinese' or 
                    is_chinese_text(title) or 
                    is_chinese_text(author)):
                    chinese_books.append(metadata)
                
                total_count += 1
                
            except json.JSONDecodeError:
                continue
    
    print(f"\n📊 中文书籍统计结果:")
    print(f"总记录数: {total_count:,}")
    print(f"中文书籍数: {len(chinese_books):,}")
    print(f"中文书籍占比: {len(chinese_books)/total_count*100:.2f}%")
    
    if chinese_books:
        # 统计作者
        authors = Counter(book.get('author', '未知') for book in chinese_books if book.get('author'))
        print(f"\n📚 热门作者 (前10):")
        for author, count in authors.most_common(10):
            print(f"  {author}: {count} 本")
        
        # 统计出版社
        publishers = Counter(book.get('publisher', '未知') for book in chinese_books if book.get('publisher'))
        print(f"\n🏢 热门出版社 (前10):")
        for pub, count in publishers.most_common(10):
            print(f"  {pub}: {count} 本")
        
        # 统计年份
        years = Counter(book.get('year', '未知') for book in chinese_books if book.get('year'))
        print(f"\n📅 出版年份分布 (前10):")
        for year, count in sorted(years.most_common(10), key=lambda x: x[0], reverse=True):
            print(f"  {year}: {count} 本")
        
        # 统计文件格式
        formats = Counter(book.get('extension', '未知') for book in chinese_books)
        print(f"\n📄 文件格式分布:")
        for fmt, count in formats.most_common():
            print(f"  {fmt}: {count} 本")
    
    return chinese_books

if __name__ == "__main__":
    # Run the analysis on the recovered dump, then write every matched
    # metadata record to a pretty-printed, non-ASCII-escaped JSON file.
    matched_records = analyze_chinese_books('/root/recovered_data.jsonl')

    with open('/root/chinese_books.json', 'w', encoding='utf-8') as output_file:
        json.dump(
            matched_records,
            output_file,
            ensure_ascii=False,
            indent=2,
        )

    print("\n✅ 中文书籍数据已保存到 chinese_books.json")
