Showing 2 changed files with 156 additions and 0 deletions
app/Services/OCR/extrac_table.py
0 → 100644
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables

# ==== Config ====
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
PDF_NAME = 'aaaa'

# PDF path
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "fax.pdf"
# Output folder
output_folder = Path(BASE_DIR) / "public" / "image"

# PDF_NAME = pdf_path.stem  # Get the stem of the PDF file
# print(PDF_NAME)

os.makedirs(output_folder, exist_ok=True)

timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False
)

# ==== PDF to Image ====
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
pages[0].save(image_path, "JPEG")

# ==== Run OCR ====
image_np = np.array(pages[0])
results = ocr.predict(image_np)

# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    xs = [p[0] for p in poly]
    ys = [p[1] for p in poly]
    return [int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))]

# ==== Build ocrData ====
ocr_data_list = []
for res in results:
    for text, poly in zip(res['rec_texts'], res['rec_polys']):
        bbox = poly_to_bbox(poly)
        ocr_data_list.append({
            "text": text,
            "bbox": bbox,
            "field": "",
            "hideBorder": False
        })

# ==== Detect table ====
table_info = detect_tables(image_path)

# ==== Build JSON ====
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info
}

# ==== Save JSON ====
json_path = os.path.join(output_folder, f"{PDF_NAME}_{timestamp}_with_table.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(final_json, f, ensure_ascii=False, indent=2)

print(f"Saved OCR + Table JSON to: {json_path}")
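For reference, a minimal sketch of how the generated JSON could be consumed downstream. The file name is a made-up example (real names follow the {PDF_NAME}_{timestamp}_with_table.json pattern above); the keys mirror what this script writes: ocr_data entries with text, bbox, field, and hideBorder, plus the tables list returned by detect_tables.

# Minimal consumption sketch for the JSON written by extrac_table.py.
# The file name is illustrative; real names embed the PDF name and a timestamp.
import json

with open("aaaa_1700000000_with_table.json", encoding="utf-8") as f:
    data = json.load(f)

for item in data["ocr_data"]:
    print(item["text"], item["bbox"])  # recognized text and [x_min, y_min, x_max, y_max]

for table in data["tables"]:
    print(table["total_rows"], table["total_cols"], table["table_box"])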
app/Services/OCR/table_detector.py
0 → 100644
import cv2
import numpy as np
import os

def detect_tables(image_path):
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)

    # Edge detection
    edges = cv2.Canny(blur, 50, 150, apertureSize=3)

    # --- Horizontal lines ---
    lines_h = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=120,
                              minLineLength=int(img.shape[1] * 0.6), maxLineGap=20)
    ys_candidates, line_segments = [], []
    if lines_h is not None:
        for line in lines_h:
            x1, y1, x2, y2 = line[0]
            if abs(y1 - y2) <= 3:  # nearly horizontal
                y_mid = int(round((y1 + y2) / 2))
                ys_candidates.append(y_mid)
                # store endpoints in ascending x order so min/max below are correct
                line_segments.append((min(x1, x2), max(x1, x2), y_mid))

    # Cluster nearby y positions into distinct horizontal rules
    ys, tol_y = [], 10
    for y in sorted(ys_candidates):
        if not ys or abs(y - ys[-1]) > tol_y:
            ys.append(y)

    total_rows = max(0, len(ys) - 1)

    # --- Vertical lines ---
    lines_v = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
                              minLineLength=int(img.shape[0] * 0.5), maxLineGap=20)
    xs = []
    if lines_v is not None:
        for line in lines_v:
            x1, y1, x2, y2 = line[0]
            if abs(x1 - x2) <= 3:  # nearly vertical
                xs.append(int(round((x1 + x2) / 2)))

    # Cluster nearby x positions into distinct vertical rules
    x_pos, tol_v = [], 10
    for v in sorted(xs):
        if not x_pos or v - x_pos[-1] > tol_v:
            x_pos.append(v)

    total_cols = max(0, len(x_pos) - 1)

    tables = []
    if len(ys) >= 3 and line_segments:
        y_min, y_max = ys[0], ys[-1]
        min_x = min(seg[0] for seg in line_segments)
        max_x = max(seg[1] for seg in line_segments)
        table_box = (min_x, y_min, max_x, y_max)

        rows = []
        for i in range(len(ys) - 1):
            row_box = (min_x, ys[i], max_x, ys[i + 1])
            rows.append({"row": tuple(int(v) for v in row_box)})
            cv2.rectangle(img, (row_box[0], row_box[1]), (row_box[2], row_box[3]), (0, 255, 255), 2)

        tables.append({
            "total_rows": int(total_rows),
            "total_cols": int(total_cols),
            "table_box": tuple(int(v) for v in table_box),
            "rows_box": rows
        })

        cv2.rectangle(img, (min_x, y_min), (max_x, y_max), (255, 0, 0), 3)

    # Save a debug image with the detected table and row boxes drawn in
    debug_path = os.path.splitext(image_path)[0] + "_debug.jpg"
    cv2.imwrite(debug_path, img)

    return tables
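A minimal usage sketch for exercising detect_tables on its own (the image path is a placeholder). It returns a list with at most one table entry and also writes a *_debug.jpg next to the input image with the detected table and row boxes drawn in.

# Minimal usage sketch; the image path below is a placeholder.
from table_detector import detect_tables

tables = detect_tables("public/image/sample_page.jpg")
for t in tables:
    print(t["total_rows"], t["total_cols"], t["table_box"])
    for r in t["rows_box"]:
        print("  row:", r["row"])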