# extrac_table.py 6.42 KB
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables
from PIL import Image, ImageEnhance

# ==== Config ====
# Project root: the directory three levels above the one holding this script.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), *([os.pardir] * 3))
)
# Prefix of every file name this script generates.
# NOTE: this is a fixed label; it is not derived from the PDF's own file name.
PDF_NAME = 'nemo_new'

# Input PDF, and the folder that receives the files this script writes.
pdf_path = Path(BASE_DIR, "storage", "pdf", "2.pdf")
output_folder = Path(BASE_DIR, "public", "image")
output_folder.mkdir(parents=True, exist_ok=True)

# Unix time in whole seconds, appended to the generated file names.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
# All three optional document pre-processing switches are passed as False.
ocr = PaddleOCR(**dict.fromkeys(
    (
        "use_doc_orientation_classify",
        "use_doc_unwarping",
        "use_textline_orientation",
    ),
    False,
))

# ==== PDF to Image ====
# Only page 1 is rendered; a JPEG copy of it is written to the output folder.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = str(output_folder / f"{img_base_name}.jpg")
pages[0].save(image_path, format="JPEG")

# ==== Run OCR ====
# Pixel array of that same page; this is what gets preprocessed and recognized.
image_np = np.array(pages[0])

def estimate_text_ratio(gray, block_size=256):
    """Return the median dark-pixel ratio over square tiles of a grayscale image.

    The image is cut into ``block_size`` x ``block_size`` tiles (tiles on the
    right/bottom edge may be smaller). Each tile is binarized with its own
    Otsu threshold, and the share of its pixels that fall on the dark side of
    that threshold is recorded. The median of the per-tile shares is returned
    so that a few unusual tiles do not dominate the estimate.

    Returns 0.0 when the image produces no tiles (e.g. an empty array).
    """
    height, width = gray.shape
    row_starts = range(0, height, block_size)
    col_starts = range(0, width, block_size)

    per_tile = []
    for top in row_starts:
        for left in col_starts:
            tile = gray[top:top + block_size, left:left + block_size]
            if not tile.size:
                continue
            _, tile_binary = cv2.threshold(
                tile, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            # Invert so that the dark (below-threshold) pixels are the non-zero ones.
            dark_mask = 255 - tile_binary
            per_tile.append(np.sum(dark_mask > 0) / dark_mask.size)

    return np.median(per_tile) if per_tile else 0.0

def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2,
                text_ratio_threshold: float = 0.02) -> np.ndarray:
    """
    Thicken thin text strokes in an RGB image before OCR.

    The median per-tile dark-pixel ratio from ``estimate_text_ratio`` decides
    whether the page is processed at all. When the ratio is not below
    ``text_ratio_threshold`` the text is treated as already bold and the input
    array is returned untouched (the very same object).

    Args:
        rgb_img: Image in RGB channel order.
        kernel_size: Side of the square dilation kernel (2 = light, 3 = stronger).
        iterations: Number of dilation passes.
        contrast: Gain applied to every pixel of the boosted grayscale image
            (1.0 = unchanged); the result saturates to 8 bits.
        sharpness: Unsharp-mask weight of the image itself; the blurred copy
            gets ``1.0 - sharpness`` so the two weights always sum to 1 and
            ``sharpness=1.0`` leaves the image unchanged.
        text_ratio_threshold: Ratio below which the text counts as thin.

    Returns:
        ``rgb_img`` itself when no processing is needed, otherwise a new
        3-channel image built from the processed grayscale.

    Side effects:
        Prints the measured ratio, and when processing happens writes a debug
        JPEG named ``<img_base_name>_preprocessed_debug.jpg`` into
        ``output_folder`` (both are module-level names).
    """
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)

    # One flag drives both the log label and the early return, so the two can
    # never disagree. Labels: 'Mảnh' = thin, 'Đậm' = bold.
    text_ratio = estimate_text_ratio(gray, block_size=256)
    is_thin = text_ratio < text_ratio_threshold
    print(f"text_ratio={text_ratio:.3f} -> {'Mảnh' if is_thin else 'Đậm'}")

    if not is_thin:
        return rgb_img

    # Global Otsu binarization, done only once we know the image is processed.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Text mask (text = 255), grown by dilation to fatten the strokes.
    text_mask = 255 - binary
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)

    # Overlay on the original grayscale: work on the inverted image so that
    # "darker" means "larger", take the per-pixel maximum, then invert back.
    inv_gray = 255 - gray
    inv_gray_boost = np.maximum(inv_gray, bold_mask)
    out_gray = 255 - inv_gray_boost

    # Linear gain with 8-bit saturation.
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)

    # Unsharp mask. The blur weight is derived from `sharpness` (it used to be
    # a fixed -0.2) so that the overall brightness is preserved for any value.
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, 1.0 - sharpness, 0)

    # Back to 3 channels for PaddleOCR.
    out_rgb = cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)

    debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
    # cv2.imwrite reports failure through its return value, not an exception.
    if cv2.imwrite(debug_path, cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)):
        print(f"[DEBUG] Preprocessed image saved to: {debug_path}")
    else:
        print(f"[DEBUG] Failed to save preprocessed image to: {debug_path}")
    return out_rgb

# Thicken faint strokes before recognition. Tuning hints:
#   kernel_size - raise it if the text is still thin
#   iterations  - use 2 for an even bolder result
#   contrast    - 1.0 keeps the image as is, 1.5-2.0 makes it clearer
#   sharpness   - values above 1.0 sharpen
preprocessed = bolden_text(image_np, kernel_size=3, iterations=1, contrast=1.5, sharpness=1.2)

# Run OCR, making sure the input has 3 channels first.
if preprocessed.ndim == 2:
    preprocessed = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)
results = ocr.predict(preprocessed)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned box ``[x_min, y_min, x_max, y_max]`` of a polygon.

    ``poly`` is a sequence of points whose first two items are x and y. Each
    extreme is converted with ``int()``, so fractional values are truncated
    toward zero.
    """
    x_coords = [point[0] for point in poly]
    y_coords = [point[1] for point in poly]
    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return list(map(int, (left, top, right, bottom)))

# ==== Build ocrData ====
# One record per recognized text line: the text plus the bounding box of its
# polygon. "field" and "hideBorder" are emitted with fixed placeholder values.
ocr_data_list = [
    {
        "text": recognized,
        "bbox": poly_to_bbox(polygon),
        "field": "",
        "hideBorder": False,
    }
    for page_result in results
    for recognized, polygon in zip(page_result['rec_texts'], page_result['rec_polys'])
]

# ==== Detect table ====
# Minimum share of an OCR box's area that must fall inside a cell for the text
# to be attached to that cell on overlap alone.
MIN_OVERLAP_RATIO = 0.3


def overlap_ratio(bbox, cell_box):
    """Return the fraction of ``bbox``'s area that lies inside ``cell_box``.

    Both boxes are ``[x1, y1, x2, y2]``. The box area is clamped to at least 1
    so a degenerate (zero-area) box cannot cause a division by zero.
    """
    ix1 = max(bbox[0], cell_box[0])
    iy1 = max(bbox[1], cell_box[1])
    ix2 = min(bbox[2], cell_box[2])
    iy2 = min(bbox[3], cell_box[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    bbox_area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
    return inter / float(bbox_area)


def center_inside(bbox, cell_box):
    """Return True when the center of ``bbox`` lies within ``cell_box`` (edges included)."""
    cx = (bbox[0] + bbox[2]) / 2.0
    cy = (bbox[1] + bbox[3]) / 2.0
    return cell_box[0] <= cx <= cell_box[2] and cell_box[1] <= cy <= cell_box[3]


table_info = detect_tables(image_path)

for table in table_info:
    for row in table["cells"]:  # each row is a list of cell dicts
        for cell in row:
            x1, y1, x2, y2 = cell["cell"]
            cell_box = [x1, y1, x2, y2]

            # A text is attached when enough of its box overlaps the cell or
            # when its center falls inside the cell.
            # NOTE: nothing here makes the assignment exclusive, so one text
            # can end up in more than one cell.
            cell_texts = [
                item["text"]
                for item in ocr_data_list
                if overlap_ratio(item["bbox"], cell_box) >= MIN_OVERLAP_RATIO
                or center_inside(item["bbox"], cell_box)
            ]

            # Store the matches on the original cell dict.
            cell["texts"] = cell_texts
            cell["text"] = " ".join(cell_texts)
# ==== Build JSON ====
# Final payload: the flat list of OCR records plus the tables, whose cells
# carry the texts matched to them.
final_json = dict(ocr_data=ocr_data_list, tables=table_info)

# ==== Save JSON ====
# Written as UTF-8 with non-ASCII characters kept as-is (no \u escapes).
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, mode="w", encoding="utf-8") as json_file:
    json.dump(final_json, json_file, indent=2, ensure_ascii=False)

print(f"Saved OCR + Table JSON to: {json_path}")