# extrac_table.py (1.96 KB)
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables

# ==== Config ====
# Project root: three directory levels above the folder that holds this script.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))

# Prefix used for every file this script writes.
# NOTE(review): this is a hard-coded placeholder that does not follow the input
# file name ("fax"); deriving it with `pdf_path.stem` may be what was intended.
PDF_NAME = 'aaaa'

# Input PDF and the folder that receives the rendered image and the JSON.
pdf_path = Path(BASE_DIR, "storage", "pdf", "fax.pdf")
output_folder = Path(BASE_DIR, "public", "image")
output_folder.mkdir(parents=True, exist_ok=True)

# A Unix timestamp (whole seconds) keeps the outputs of separate runs apart.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
# All three optional pre-processing stages are switched off.
ocr_options = {
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr = PaddleOCR(**ocr_options)

# ==== PDF to Image ====
# Only the first page of the PDF is rendered and processed.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
first_page = pages[0]
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
first_page.save(image_path, "JPEG")

# ==== Run OCR ====
# NOTE(review): the array is built straight from the rendered page, presumably
# in RGB channel order; confirm that this is what `ocr.predict` expects.
image_np = np.array(first_page)
results = ocr.predict(image_np)

# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned bounding box of a polygon.

    Args:
        poly: Re-iterable collection of points; each point is indexable, with
            the x coordinate at index 0 and the y coordinate at index 1.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as a list of Python ``int`` values.
        Each extreme is converted with ``int()``, so fractional coordinates
        are truncated toward zero.

    Raises:
        ValueError: If ``poly`` contains no points.
    """
    x_coords = [point[0] for point in poly]
    y_coords = [point[1] for point in poly]
    extremes = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
    # int() turns whatever numeric type the points carry into plain ints.
    return [int(value) for value in extremes]

# ==== Build ocrData ====
# One entry per recognised text line across all OCR results. Texts and polygons
# are paired positionally (zip stops at the shorter of the two sequences).
# "field" and "hideBorder" are always written with the fixed defaults below.
ocr_data_list = [
    {
        "text": line_text,
        "bbox": poly_to_bbox(line_poly),
        "field": "",
        "hideBorder": False,
    }
    for page_result in results
    for line_text, line_poly in zip(page_result['rec_texts'], page_result['rec_polys'])
]

# ==== Detect table ====
# Table detection is given the path of the saved JPEG, not the in-memory image.
table_info = detect_tables(image_path)

# ==== Build JSON ====
final_json = {"ocr_data": ocr_data_list, "tables": table_info}

# ==== Save JSON ====
# ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes,
# so the file is opened explicitly as UTF-8.
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(final_json, json_file, ensure_ascii=False, indent=2)

print(f"Saved OCR + Table JSON to: {json_path}")