# extrac_table.py
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables
from PIL import Image, ImageEnhance
# ==== Config ====
# Project root: three directory levels above this script.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Input PDF (hard-coded; only its first page is processed below).
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "3.pdf"
# Output folder for the rendered page image, debug image and the final JSON.
output_folder = Path(BASE_DIR) / "public" / "image"
PDF_NAME = pdf_path.stem  # base name of the PDF, reused in output file names
os.makedirs(output_folder, exist_ok=True)
timestamp = int(time.time())  # makes output names unique per run
img_base_name = f"{PDF_NAME}_{timestamp}"
# ==== OCR Init ====
# Document-orientation / unwarping / textline-orientation sub-models are
# disabled — presumably to speed up recognition on already-upright scans.
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False
)
# ==== PDF to Image ====
# Render only page 1 and save it as JPEG; detect_tables() reads this file later.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
pages[0].save(image_path, "JPEG")
# ==== Run OCR ====
# RGB numpy array of the first page, fed to the preprocessing step below.
image_np = np.array(pages[0])
def estimate_text_ratio(gray, block_size=256):
    """Estimate the fraction of "ink" (text) pixels in a grayscale image.

    The image is tiled into ``block_size`` x ``block_size`` blocks, each block
    is binarized with a per-block Otsu threshold, and the ratio of dark
    (text) pixels is computed per block.  The *median* over all blocks is
    returned so that a few dense regions (pictures, filled table cells) do
    not skew the estimate.

    Args:
        gray: 2-D uint8 grayscale image.
        block_size: side length of the square tiles, in pixels.

    Returns:
        float: median per-block text ratio in [0, 1]; 0.0 for an empty image.
    """
    h, w = gray.shape
    ratios = []
    for y in range(0, h, block_size):
        for x in range(0, w, block_size):
            block = gray[y:y+block_size, x:x+block_size]
            if block.size == 0:
                continue
            # Otsu picks the threshold per block; text pixels come out black (0).
            _, binary = cv2.threshold(block, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            text_mask = 255 - binary
            ratios.append(np.sum(text_mask > 0) / text_mask.size)
    if not ratios:
        return 0.0
    # float() keeps the return type consistent with the empty-image case above
    # (np.median would otherwise return np.float64).
    return float(np.median(ratios))
def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2) -> np.ndarray:
    """Thicken thin text strokes in an RGB image to help OCR.

    If the estimated text ratio is already above a small threshold the image
    is considered bold enough and returned unchanged.  Otherwise the text
    mask is dilated, overlaid onto the grayscale image, and contrast and
    sharpness are boosted before converting back to RGB.

    Args:
        rgb_img: input page image as an RGB numpy array.
        kernel_size: dilation kernel size (2 = light, 3 = stronger).
        iterations: number of dilation passes.
        contrast: linear contrast gain (>= 1.0).
        sharpness: unsharp-mask gain (>= 1.0).

    Returns:
        RGB numpy array — either the untouched input or the boldened version.
    """
    # RGB -> grayscale for thresholding.
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)
    # Global Otsu binarization; text pixels become black (0) in `binary`.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Heuristic cutoff: below this median ink ratio the strokes count as thin.
    threshold = 0.02
    text_ratio = estimate_text_ratio(gray, block_size=256)
    print(f"text_ratio={text_ratio:.3f} -> {'Mảnh' if text_ratio < threshold else 'Đậm'}")
    if text_ratio > threshold:
        # Text is already dense enough — skip preprocessing entirely.
        return rgb_img
    # Dilation: grow each stroke by roughly kernel_size pixels.
    text_mask = 255 - binary
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)
    # Overlay the bold mask onto the original grayscale.  Working in inverted
    # space lets np.maximum darken (never brighten) the text pixels.
    inv_gray = 255 - gray
    inv_gray_boost = np.maximum(inv_gray, bold_mask)
    out_gray = 255 - inv_gray_boost
    # Linear contrast boost.
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)
    # Unsharp mask: subtract a slight Gaussian blur to crispen edges.
    # NOTE(review): the blur weight is fixed at -0.2, so the weights only sum
    # to 1.0 when sharpness == 1.2 — other values shift overall brightness.
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, -0.2, 0)
    # Back to 3-channel RGB for PaddleOCR.
    out_rgb = cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)
    # Save a debug copy of the preprocessed image next to the other outputs.
    debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
    cv2.imwrite(debug_path, cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR))
    print(f"[DEBUG] Preprocessed image saved to: {debug_path}")
    return out_rgb
# Conditionally thicken thin text strokes before recognition.
preprocessed = bolden_text(
image_np,
kernel_size=3,   # raise if the strokes are still thin
iterations=1,    # raise to 2 for a heavier bold effect
contrast=1.5,    # 1.0 = unchanged, 1.5-2.0 = more contrast
sharpness=1.2    # > 1.0 sharpens edges
)
# Run OCR — PaddleOCR expects 3 channels, so promote grayscale if needed.
if preprocessed.ndim == 2:
    preprocessed = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)
results = ocr.predict(preprocessed)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the integer axis-aligned bounding box of a polygon.

    Accepts any iterable of (x, y)-indexable points and yields
    [x_min, y_min, x_max, y_max], each truncated to int.
    """
    x_vals = [pt[0] for pt in poly]
    y_vals = [pt[1] for pt in poly]
    left, top = int(min(x_vals)), int(min(y_vals))
    right, bottom = int(max(x_vals)), int(max(y_vals))
    return [left, top, right, bottom]
# ==== Build ocrData ====
# One entry per recognized text line: its text and bounding box, plus empty
# "field" / "hideBorder" defaults (presumably filled in downstream — verify).
ocr_data_list = [
    {
        "text": text,
        "bbox": poly_to_bbox(poly),
        "field": "",
        "hideBorder": False,
    }
    for res in results
    for text, poly in zip(res['rec_texts'], res['rec_polys'])
]
# ==== Detect table ====
# For every detected table cell, gather the OCR items whose bounding boxes
# lie fully inside the cell rectangle and join them into the cell's text.
table_info = detect_tables(image_path)
for table in table_info:
    for row in table["cells"]:  # row: list of cell dicts
        for cell in row:
            x1, y1, x2, y2 = cell["cell"]
            cell_texts = [
                entry["text"]
                for entry in ocr_data_list
                if entry["bbox"][0] >= x1 and entry["bbox"][1] >= y1
                and entry["bbox"][2] <= x2 and entry["bbox"][3] <= y2
            ]
            # Enrich the original cell dict in place.
            cell["texts"] = cell_texts
            cell["text"] = " ".join(cell_texts)
# ==== Build JSON ====
# Combined payload: flat OCR items plus the table structure with cell texts.
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info,
}
# ==== Save JSON ====
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(final_json, ensure_ascii=False, indent=2))
print(f"Saved OCR + Table JSON to: {json_path}")