extrac_table.py
7.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables
from PIL import Image, ImageEnhance
# ==== Config ====
# Project root: three directory levels above the folder that holds this script.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Prefix of every output file name.
# NOTE(review): this differs from the stem of the PDF processed below
# ("Iwasaki_1"); confirm whether it should be derived from pdf_path.stem.
PDF_NAME = 'nemo_new'
# Input PDF
pdf_path = Path(BASE_DIR, "storage", "pdf", "Iwasaki_1.pdf")
# Folder that receives the page image, the debug image and the JSON result
output_folder = Path(BASE_DIR, "public", "image")
output_folder.mkdir(parents=True, exist_ok=True)
# Unix time in seconds, embedded in every output file name.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
# The three optional orientation / unwarping stages are switched off.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)

# ==== PDF to Image ====
# Only page 1 is rendered; it is stored as a JPEG next to the other outputs.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = str(output_folder / f"{img_base_name}.jpg")
pages[0].save(image_path, "JPEG")

# ==== Run OCR ====
# Array form of the rendered page, used as input for preprocessing and OCR.
image_np = np.array(pages[0])
def estimate_text_ratio(gray, block_size=256):
    """Return the median per-block ratio of dark pixels in a grayscale image.

    The image is cut into tiles of at most ``block_size`` x ``block_size``
    pixels (tiles on the right/bottom edge may be smaller). Every tile is
    binarized with Otsu's method, and the share of pixels that end up at or
    below the threshold is recorded. The median over all tiles is returned
    so that a few unusual tiles do not dominate the estimate.

    Args:
        gray: 2-D image array; it is passed to ``cv2.threshold`` with the
            Otsu flag, so it is presumably single-channel 8-bit.
        block_size: Tile edge length in pixels.

    Returns:
        The median ratio, or ``0.0`` when no tile was measured.
    """
    height, width = gray.shape
    tile_ratios = []
    for top in range(0, height, block_size):
        for left in range(0, width, block_size):
            tile = gray[top:top + block_size, left:left + block_size]
            if tile.size == 0:
                continue
            _, tile_binary = cv2.threshold(
                tile, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            # Pixels that were not set to 255 are the dark side of the
            # threshold and are counted as text.
            dark = tile_binary != 255
            tile_ratios.append(np.sum(dark) / dark.size)
    if not tile_ratios:
        return 0.0
    # Median rather than mean, to stay robust against outlier tiles.
    return np.median(tile_ratios)
def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2) -> "tuple[np.ndarray, str]":
    """Thicken thin text strokes in an RGB image before OCR.

    The median text ratio of the page is estimated first. When it is above
    the internal threshold (0.02) the text is considered bold enough and the
    input is returned untouched. Otherwise the Otsu text mask is dilated,
    merged into the grayscale image, contrast-scaled, sharpened with an
    unsharp mask, and written to disk as a debug image.

    Args:
        rgb_img: Input image, converted with ``cv2.COLOR_RGB2GRAY``.
        kernel_size: Edge of the square dilation kernel (2 = light,
            3 = stronger).
        iterations: Number of dilation passes.
        contrast: Linear gain applied to the grayscale result (>= 1.0).
        sharpness: Weight of the sharp image in the unsharp mask (>= 1.0;
            1.0 applies no sharpening).

    Returns:
        ``(image, debug_path)``. ``image`` is ``rgb_img`` itself when no
        processing was needed, otherwise the processed 3-channel image.
        ``debug_path`` is where the processed image is written; the file is
        only created when processing actually ran.
    """
    threshold = 0.02

    # RGB -> gray
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)
    text_ratio = estimate_text_ratio(gray, block_size=256)
    # One flag drives both the log label and the branch, so they cannot
    # disagree when text_ratio equals the threshold.
    already_bold = text_ratio > threshold
    print(f"text_ratio={text_ratio:.3f} -> {'Đậm' if already_bold else 'Mảnh'}")
    debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
    if already_bold:
        return rgb_img, debug_path

    # Otsu binarization, inverted so that text pixels are 255.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text_mask = 255 - binary

    # Dilation widens the strokes.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)

    # Overlay on the original gray image: work in inverted space so that the
    # darker of (original pixel, dilated text) wins.
    inv_gray = 255 - gray
    inv_gray_boost = np.maximum(inv_gray, bold_mask)
    out_gray = 255 - inv_gray_boost

    # Contrast boost (linear scale).
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)

    # Unsharp mask. The blur weight is derived from `sharpness` so the two
    # weights always sum to 1 and overall brightness is preserved; with the
    # default 1.2 this is the previous fixed -0.2 (up to float rounding).
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, 1.0 - sharpness, 0)

    # Back to 3 channels for PaddleOCR.
    out_rgb = cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)

    # cv2.imwrite reports failure through its return value, not an exception.
    if cv2.imwrite(debug_path, cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)):
        print(f"[DEBUG] Preprocessed image saved to: {debug_path}")
    else:
        print(f"[DEBUG] Failed to save preprocessed image to: {debug_path}")
    return out_rgb, debug_path
# Preprocess the page. Tuning hints: a larger kernel_size or more iterations
# gives bolder strokes; contrast 1.0 keeps the image as is and 1.5-2.0 makes
# it clearer; sharpness above 1.0 sharpens.
preprocessed, debug_file = bolden_text(
    image_np, kernel_size=3, iterations=1, contrast=1.5, sharpness=1.2
)
# Run OCR on a 3-channel image: a single-channel result is expanded first.
preprocessed = (
    cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)
    if preprocessed.ndim == 2
    else preprocessed
)
results = ocr.predict(preprocessed)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned bounding box of a polygon.

    Args:
        poly: Sequence of points; index 0 of a point is x, index 1 is y.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` with every value converted by
        ``int()`` (fractions are truncated toward zero).
    """
    x_values = list(map(lambda point: point[0], poly))
    y_values = list(map(lambda point: point[1], poly))
    extremes = (min(x_values), min(y_values), max(x_values), max(y_values))
    return [int(value) for value in extremes]
# ==== Build ocrData ====
# One entry per recognized text line: the text, its bounding box, and two
# placeholder fields ("field", "hideBorder") that are stored with the entry.
ocr_data_list = []
for res in results:
    ocr_data_list.extend(
        {
            "text": text,
            "bbox": poly_to_bbox(poly),
            "field": "",
            "hideBorder": False,
        }
        for text, poly in zip(res['rec_texts'], res['rec_polys'])
    )
def overlap_ratio(bbox, cell_box):
    """Return the fraction of ``bbox``'s area that lies inside ``cell_box``.

    Both boxes are ``[x1, y1, x2, y2]``. The divisor is the area of ``bbox``
    clamped to at least 1, so a zero-area box yields 0.0 instead of a
    division by zero.
    """
    left = max(bbox[0], cell_box[0])
    top = max(bbox[1], cell_box[1])
    right = min(bbox[2], cell_box[2])
    bottom = min(bbox[3], cell_box[3])
    # Negative extents mean the boxes do not overlap on that axis.
    shared_width = max(0, right - left)
    shared_height = max(0, bottom - top)
    own_area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
    return (shared_width * shared_height) / float(max(1, own_area))
def center_inside(bbox, cell_box):
    """Tell whether the center point of ``bbox`` lies within ``cell_box``.

    Both boxes are ``[x1, y1, x2, y2]``; points on the border of
    ``cell_box`` count as inside.
    """
    center_x = (bbox[0] + bbox[2]) / 2.0
    center_y = (bbox[1] + bbox[3]) / 2.0
    return (cell_box[0] <= center_x <= cell_box[2]
            and cell_box[1] <= center_y <= cell_box[3])
# ==== Detect table ====
# Run table detection on the preprocessed debug image when one was written;
# otherwise keep the page image rendered from the PDF.
if debug_file and os.path.exists(debug_file):
    image_path = debug_file
table_info = detect_tables(image_path)

for table_index, table in enumerate(table_info):
    for row in table["cells"]:  # each row is a list of cell dicts
        for cell in row:
            x1, y1, x2, y2 = cell["cell"]
            cell_box = [x1, y1, x2, y2]

            # OCR items that belong to this cell: at least 30% of the item
            # lies inside the cell, or the item's center does.
            items_in_cell = [
                item for item in ocr_data_list
                if overlap_ratio(item["bbox"], cell_box) >= 0.3
                or center_inside(item["bbox"], cell_box)
            ]

            # Texts found in this cell, in reading order.
            cell_texts = []
            if items_in_cell:
                # Top-to-bottom order so neighbouring items can be grouped
                # into visual lines.
                items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))

                # Start a new line whenever the top edge moves more than
                # 5 px away from the previous item's top edge.
                line_groups = []
                current_group = [items_in_cell[0]]
                for it in items_in_cell[1:]:
                    if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5:
                        line_groups.append(current_group)
                        current_group = [it]
                    else:
                        current_group.append(it)
                line_groups.append(current_group)

                # Tag every OCR item with its table position; cells holding
                # several lines get a "<col>_<line>" suffix on col_idx.
                multi_line = len(line_groups) > 1
                for sub_i, group in enumerate(line_groups, start=1):
                    # Items on one visual line may differ by a few pixels in
                    # y, so the (y, x) sort above does not guarantee
                    # left-to-right order inside a line; order by x here.
                    group.sort(key=lambda it: it["bbox"][0])
                    for it in group:
                        cell_texts.append(it["text"])
                        it["table"] = {
                            "bbox": {
                                "table_index": table_index,
                                "row_idx": cell["row_idx"],
                                "col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
                            }
                        }

            # Store the collected texts on the cell (empty for empty cells).
            cell["texts"] = cell_texts
            cell["text"] = " ".join(cell_texts)

# ==== Build JSON ====
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info
}

# ==== Save JSON ====
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(final_json, f, ensure_ascii=False, indent=2)
print(f"Saved OCR + Table JSON to: {json_path}")