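# extrac_table.py
#
# OCR the first page of a PDF with PaddleOCR, detect table cells on the page
# image, and save the recognized text mapped onto those cells as JSON.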

from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables
from PIL import Image, ImageEnhance

# ==== Config ====
# BASE_DIR is three directory levels above the directory containing this file.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Prefix used for every output file name (page image, debug image, JSON).
# NOTE(review): this does not match the stem of the PDF configured below
# ("Iwasaki_1") — confirm whether that is intentional.
PDF_NAME = 'nemo_new'

# Input PDF path
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "Iwasaki_1.pdf"
# Output folder for the rendered page image, the debug image and the JSON
output_folder = Path(BASE_DIR) / "public" / "image"

# Disabled alternative: derive the output prefix from the PDF file name.
# PDF_NAME = pdf_path.stem  # Get the stem of the PDF file
#print(PDF_NAME)

# Create the output folder (and parents) if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Whole-second Unix timestamp appended to output names so that separate runs
# write to different files.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
# The three optional document/text-line preprocessing stages are switched off
# (per the flag names); the page image is passed to OCR as-is.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False
)

# ==== PDF to Image ====
# Only the first page of the PDF is rendered.
# NOTE(review): pages[0] raises IndexError if no page was rendered.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
pages[0].save(image_path, "JPEG")

# ==== Page image as a NumPy array (input to preprocessing / OCR) ====
image_np = np.array(pages[0])

def estimate_text_ratio(gray, block_size=256):
    """Return the median per-tile "ink" ratio of a grayscale image.

    The image is walked in ``block_size`` x ``block_size`` tiles (edge tiles
    may be smaller). Each tile is binarised with Otsu's threshold and the
    fraction of pixels that fall on the dark side is recorded.

    Args:
        gray: 2-D grayscale image array.
        block_size: tile edge length in pixels.

    Returns:
        The median of the per-tile ratios, or ``0.0`` when the image yields
        no tiles.
    """
    height, width = gray.shape
    tile_ratios = []
    for top in range(0, height, block_size):
        for left in range(0, width, block_size):
            tile = gray[top:top + block_size, left:left + block_size]
            if tile.size:
                _, tile_binary = cv2.threshold(
                    tile, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
                )
                # Invert so that dark (text) pixels become non-zero.
                ink_mask = 255 - tile_binary
                tile_ratios.append(np.sum(ink_mask > 0) / ink_mask.size)

    if not tile_ratios:
        return 0.0
    # Median rather than mean, so a few unusual tiles do not dominate.
    return np.median(tile_ratios)

def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2) -> "tuple[np.ndarray, str]":
    """Thicken thin text strokes in an RGB page image before OCR.

    The page is first classified with ``estimate_text_ratio``. If the median
    ink ratio is at or above the internal threshold, the text is considered
    bold enough and the input image is returned untouched. Otherwise the
    Otsu text mask is dilated, merged back into the grayscale page, contrast
    is scaled, an unsharp mask is applied, and a debug copy is written to disk.

    Reads the module-level ``output_folder`` and ``img_base_name`` to build
    the debug image path.

    Args:
        rgb_img: RGB image array.
        kernel_size: side length of the square dilation kernel
            (2 = light, 3 = stronger).
        iterations: number of dilation passes.
        contrast: linear gain applied to the grayscale result
            (1.0 = unchanged).
        sharpness: unsharp-mask weight of the image; the blurred copy gets
            weight ``1 - sharpness`` (1.0 = unchanged).

    Returns:
        ``(image, debug_path)``. ``image`` is ``rgb_img`` itself when
        preprocessing is skipped, otherwise a new 3-channel RGB array.
        ``debug_path`` is always returned, but the file only exists when
        preprocessing ran and the write succeeded.
    """
    # RGB -> gray
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)

    # Classify the page once and use the same decision for both the log line
    # and the early return, so they cannot disagree at the boundary.
    threshold = 0.02
    text_ratio = estimate_text_ratio(gray, block_size=256)
    is_thin = text_ratio < threshold
    print(f"text_ratio={text_ratio:.3f} -> {'Mảnh' if is_thin else 'Đậm'}")
    debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
    if not is_thin:
        return rgb_img, debug_path

    # Otsu binarisation, inverted so that text pixels are 255.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text_mask = 255 - binary

    # Dilate the text mask to thicken strokes.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)

    # Overlay the thickened mask on the original gray image (work in the
    # inverted domain so "more ink" is a per-pixel maximum).
    inv_gray = 255 - gray
    inv_gray_boost = np.maximum(inv_gray, bold_mask)
    out_gray = 255 - inv_gray_boost

    # Contrast boost (linear scale).
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)

    # Unsharp mask. The two weights sum to 1 so overall brightness is
    # preserved for any ``sharpness``, not only for the default 1.2.
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, 1.0 - sharpness, 0)

    # Back to 3-channel RGB for PaddleOCR.
    out_rgb = cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)

    # cv2.imwrite reports failure through its return value, not an exception.
    if cv2.imwrite(debug_path, cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)):
        print(f"[DEBUG] Preprocessed image saved to: {debug_path}")
    else:
        print(f"[DEBUG] Failed to save preprocessed image to: {debug_path}")
    return out_rgb, debug_path

# Preprocess the page image. bolden_text returns the image to feed to OCR and
# the path of the debug image it writes when preprocessing actually runs.
preprocessed,debug_file = bolden_text(
    image_np,
    kernel_size=3,     # raise to 3 if the text is still thin
    iterations=1,      # raise to 2 for bolder strokes
    contrast=1.5,      # 1.0 = unchanged, 1.5-2.0 = clearer
    sharpness=1.2      # >1.0 for a sharper result
)


# Run OCR (make sure the image has 3 channels first)
if preprocessed.ndim == 2:
    preprocessed = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)

results = ocr.predict(preprocessed)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned box ``[x_min, y_min, x_max, y_max]`` of a polygon.

    ``poly`` is an iterable of points whose first two components are x and y.
    Each coordinate of the result is converted with ``int()``.
    """
    x_coords = []
    y_coords = []
    for point in poly:
        x_coords.append(point[0])
        y_coords.append(point[1])

    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return [int(left), int(top), int(right), int(bottom)]

# ==== Build ocrData ====
# One entry per recognized text: its string, its axis-aligned bounding box,
# and two fields initialised to fixed defaults ("field" and "hideBorder").
ocr_data_list = [
    {
        "text": text,
        "bbox": poly_to_bbox(poly),
        "field": "",
        "hideBorder": False,
    }
    for res in results
    for text, poly in zip(res['rec_texts'], res['rec_polys'])
]
def overlap_ratio(bbox, cell_box):
    """Return the fraction of ``bbox``'s area that lies inside ``cell_box``.

    Both boxes are ``[x1, y1, x2, y2]``. The denominator is the area of
    ``bbox`` clamped to at least 1, so a degenerate box cannot divide by zero.
    """
    left = max(bbox[0], cell_box[0])
    top = max(bbox[1], cell_box[1])
    right = min(bbox[2], cell_box[2])
    bottom = min(bbox[3], cell_box[3])

    # A negative extent means the boxes do not overlap on that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)

    box_w = bbox[2] - bbox[0]
    box_h = bbox[3] - bbox[1]
    denominator = max(1, box_w * box_h)
    return (overlap_w * overlap_h) / float(denominator)
def center_inside(bbox, cell_box):
    """Return whether the centre of ``bbox`` lies within ``cell_box``.

    Both boxes are ``[x1, y1, x2, y2]``; the cell's edges count as inside.
    """
    mid_x = (bbox[0] + bbox[2]) / 2.0
    mid_y = (bbox[1] + bbox[3]) / 2.0
    return (cell_box[0] <= mid_x <= cell_box[2]) and (cell_box[1] <= mid_y <= cell_box[3])
# ==== Detect table ====
# Run table detection on the preprocessed debug image when one was actually
# written; otherwise fall back to the page image rendered from the PDF.
if debug_file and os.path.exists(debug_file):
    image_path = debug_file
table_info = detect_tables(image_path)

# Attach OCR items to table cells and collect each cell's text.
for table_index, table in enumerate(table_info):
    for row in table["cells"]:  # each row is a list of cell dicts
        for cell in row:
            x1, y1, x2, y2 = cell["cell"]
            cell_box = [x1, y1, x2, y2]

            # OCR items belonging to this cell: at least 30% of the item's
            # box overlaps the cell, or the item's centre lies inside it.
            items_in_cell = [
                item for item in ocr_data_list
                if overlap_ratio(item["bbox"], cell_box) >= 0.3 or center_inside(item["bbox"], cell_box)
            ]

            # Texts collected for this cell, in reading order.
            cell_texts = []

            if items_in_cell:
                # Order by top edge (then left edge) so that items on the
                # same visual line end up adjacent.
                items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))

                # Split into lines: a new line starts when the top edge jumps
                # by more than 5 px relative to the previous item.
                line_groups = []
                current_group = [items_in_cell[0]]
                for it in items_in_cell[1:]:
                    if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5:  # threshold 5 px
                        line_groups.append(current_group)
                        current_group = [it]
                    else:
                        current_group.append(it)
                line_groups.append(current_group)

                # Tag every item with its table position. For multi-line
                # cells the column index gets a "_<line number>" suffix.
                multi_line = len(line_groups) > 1
                for sub_i, group in enumerate(line_groups, start=1):
                    # Items within one line may differ by a few pixels in y,
                    # so the (y, x) sort above does not guarantee
                    # left-to-right order; re-sort each line by its left edge.
                    group.sort(key=lambda it: it["bbox"][0])
                    for it in group:
                        cell_texts.append(it["text"])
                        it["table"] = {
                            "bbox": {
                                "table_index": table_index,
                                "row_idx": cell["row_idx"],
                                "col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
                            }
                        }

            # Store the individual and the joined texts on the cell.
            cell["texts"] = cell_texts
            cell["text"] = " ".join(cell_texts)
# ==== Build JSON ====
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info
}


# ==== Save JSON ====
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(final_json, f, ensure_ascii=False, indent=2)

print(f"Saved OCR + Table JSON to: {json_path}")