#!/usr/bin/env python3
import pytesseract
from pdf2image import convert_from_path
import os

def process_pdf_batch(pdf_path, output_path, batch_size=10, checkpoint_every=50):
    """OCR a PDF in page batches and stream the recognised text to a file.

    Pages are rendered ``batch_size`` at a time (200 dpi) so that only one
    batch of page images is requested from pdf2image per call, and each
    page's text is written to ``output_path`` as soon as it is recognised
    instead of being accumulated in memory.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the UTF-8 text file to write. It is created or
            truncated once the PDF's page count has been read successfully.
        batch_size: Number of pages rendered per ``convert_from_path`` call.
            Must be >= 1.
        checkpoint_every: Flush the output file and print a progress line
            after at least this many pages have been written since the last
            flush. Must be >= 1.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are not propagated to the caller.

    Error handling:
        A failure while rendering or recognising a batch stops processing
        but keeps the text written so far (best effort). Any other failure
        (invalid arguments, unreadable PDF, output file cannot be opened) is
        reported as a failed conversion.
    """
    # Local import: only ``convert_from_path`` is imported at module level.
    from pdf2image import pdfinfo_from_path

    try:
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if checkpoint_every < 1:
            raise ValueError(
                f"checkpoint_every must be >= 1, got {checkpoint_every}"
            )

        # Read the page count from the PDF metadata rather than rendering a
        # throw-away page; this also fails fast on an unreadable PDF before
        # the output file is touched.
        total_pages = pdfinfo_from_path(pdf_path)["Pages"]
        print("📄 开始处理PDF文件...")

        failed = False
        last_page_done = 0       # last page whose text has been written
        pages_since_flush = 0    # pages written since the last checkpoint

        with open(output_path, 'w', encoding='utf-8') as out:
            for first_page in range(1, total_pages + 1, batch_size):
                # Clamp so the progress message never names pages that do
                # not exist.
                last_page = min(first_page + batch_size - 1, total_pages)
                try:
                    print(f"🔄 处理第 {first_page}-{last_page} 页...")
                    batch_pages = convert_from_path(
                        pdf_path,
                        first_page=first_page,
                        last_page=last_page,
                        dpi=200,
                    )

                    # Defensive: stop if the renderer yields nothing.
                    if not batch_pages:
                        break

                    for current_page, page in enumerate(batch_pages, start=first_page):
                        print(f"🔍 OCR第 {current_page} 页...")
                        text = pytesseract.image_to_string(page, lang='chi_sim+eng')
                        out.write(f"--- 第{current_page}页 ---\n{text}\n\n")
                        last_page_done = current_page

                    # Count pages instead of testing ``page_num % 50`` so the
                    # checkpoint fires for any batch size.
                    pages_since_flush += len(batch_pages)
                    if pages_since_flush >= checkpoint_every:
                        out.flush()
                        pages_since_flush = 0
                        print(f"💾 已保存到第 {last_page_done} 页")

                except Exception as e:
                    # Best effort: keep what has been written so far.
                    print(f"⚠️ 批次处理错误: {e}")
                    failed = True
                    break

        if failed:
            # Do not claim success when a batch error cut the run short.
            print(f"⚠️ 转换未完成，已保存前 {last_page_done} 页的结果")
        else:
            print("✅ 转换完成！")
        print(f"📁 输出文件: {output_path}")
        print(f"📊 文件大小: {os.path.getsize(output_path)} 字节")

    except Exception as e:
        print(f"❌ 转换失败: {e}")

if __name__ == "__main__":
    # Script entry point.
    # Usage: script.py [PDF_PATH [OUTPUT_TXT_PATH]]
    # Either argument may be omitted, in which case the built-in default
    # below is used, so running the script with no arguments behaves as
    # before.
    import sys

    # NOTE(review): the default input path looks like an un-renamed download
    # redirect name (it still carries a query string) — confirm it is the
    # intended file.
    default_pdf_file = '/root/redirection?filename=许建平解说金瓶梅 (许建平著, Xu Jianping zhu, Jianping Xu, 许建平, 1958- etc.) (Z-Library).pdf&s=davinci&md5=v_T4LyrTS7Sj7g5fUgC3TA&expires=1769341948'
    default_output_file = '/root/金瓶梅解说_完整OCR.txt'

    cli_args = sys.argv[1:]
    pdf_file = cli_args[0] if len(cli_args) >= 1 else default_pdf_file
    output_file = cli_args[1] if len(cli_args) >= 2 else default_output_file

    process_pdf_batch(pdf_file, output_file)
