import argparse
import os

from pdf2image import convert_from_path
import pytesseract

def convert_pdf_to_txt_ocr(pdf_path, txt_path, *, lang='chi_sim+eng'):
    """OCR every page of a PDF and write the recognized text to a file.

    The PDF is rendered to page images with ``convert_from_path``, each
    image is passed to ``pytesseract.image_to_string``, and the per-page
    results are written to ``txt_path`` as UTF-8, each page followed by a
    blank line.

    Args:
        pdf_path: Path to the input PDF file.
        txt_path: Path of the text file to create or overwrite.
        lang: Language specification passed to pytesseract. Defaults to
            ``'chi_sim+eng'`` (Simplified Chinese + English), matching the
            previously hard-coded value.

    Any exception raised while rendering, recognizing, or writing is
    propagated to the caller. The output file is only opened after all
    pages have been recognized.
    """
    images = convert_from_path(pdf_path)
    # Join once instead of growing a string with += for every page.
    text = "".join(
        pytesseract.image_to_string(image, lang=lang) + "\n\n"
        for image in images
    )
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Converted '{pdf_path}' to '{txt_path}' with OCR")

def main():
    """Parse command-line arguments and OCR the given PDF to a text file.

    Positional argument ``pdf`` is the input path. ``-o/--output`` selects
    the output path; when omitted, the output is the input path with a
    trailing ``.pdf`` extension (any letter case) replaced by ``_ocr.txt``.
    If the input does not end in ``.pdf``, ``_ocr.txt`` is appended to the
    whole path instead, so the default output never equals the input.
    """
    parser = argparse.ArgumentParser(description="Convert scanned PDF to TXT using OCR")
    parser.add_argument("pdf", help="Path to input PDF file")
    parser.add_argument("-o", "--output", help="Path to output TXT file (default: same as input with _ocr.txt extension)")
    args = parser.parse_args()

    pdf_path = args.pdf
    txt_path = args.output
    if not txt_path:
        # Only touch the final extension: str.replace would rewrite every
        # ".pdf" substring in the path, and would leave the path unchanged
        # (so the source PDF gets overwritten) for names such as "SCAN.PDF".
        root, ext = os.path.splitext(pdf_path)
        base = root if ext.lower() == '.pdf' else pdf_path
        txt_path = base + '_ocr.txt'

    convert_pdf_to_txt_ocr(pdf_path, txt_path)

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()