tien_nemo

test1

from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import os
import time
import numpy as np
import json
from pathlib import Path
import cv2
from table_detector import detect_tables

# ==== Config ====
# Project root: three directory levels above the folder containing this file.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Prefix used for every generated file name.
# TODO(review): hard-coded placeholder; deriving it from pdf_path.stem may be
# what was intended - confirm before changing, since it alters output names.
PDF_NAME = 'aaaa'

# PDF path
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "fax.pdf"
# Output folder
output_folder = Path(BASE_DIR) / "public" / "image"

os.makedirs(output_folder, exist_ok=True)

# Output names embed the Unix time (whole seconds) at which the script started.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)

# ==== PDF to Image ====
# Only the first page of the PDF is rendered.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
if not pages:
    # Fail with a clear message instead of an IndexError on pages[0].
    raise RuntimeError(f"No pages rendered from PDF: {pdf_path}")
first_page = pages[0]
# Kept as a plain string path (os.path.join result) for downstream consumers.
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
first_page.save(image_path, "JPEG")

# ==== Run OCR ====
# NOTE(review): pages rendered by pdf2image are PIL images (presumably RGB);
# if the OCR pipeline expects OpenCV-style BGR arrays a conversion may be
# needed - confirm against the PaddleOCR documentation.
image_np = np.array(first_page)
results = ocr.predict(image_np)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned bounding box of a polygon.

    Args:
        poly: Sized sequence of points. Only the first two components of
            each point are read, as x and y respectively.

    Returns:
        ``[x_min, y_min, x_max, y_max]``, each extreme converted with
        ``int()`` so the list holds built-in ints.

    Raises:
        ValueError: If ``poly`` contains no points.
    """
    # len() rather than truthiness: a numpy array has no unambiguous bool value.
    if len(poly) == 0:
        raise ValueError("poly_to_bbox() requires at least one point")
    xs = [p[0] for p in poly]
    ys = [p[1] for p in poly]
    return [int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))]
# ==== Build ocrData ====
# One entry per (text, polygon) pair in each OCR result; "field" and
# "hideBorder" are emitted with fixed initial values.
ocr_data_list = []
for res in results:
    ocr_data_list.extend(
        {
            "text": text,
            "bbox": poly_to_bbox(poly),
            "field": "",
            "hideBorder": False,
        }
        for text, poly in zip(res['rec_texts'], res['rec_polys'])
    )

# ==== Detect table ====
# Runs on the JPEG written to disk earlier, not on the in-memory array.
table_info = detect_tables(image_path)

# ==== Build JSON ====
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info,
}

# ==== Save JSON ====
# Reuse img_base_name so the JSON and the page image share one name prefix.
json_path = os.path.join(output_folder, f"{img_base_name}_with_table.json")
with open(json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII OCR text as-is instead of \u escapes.
    json.dump(final_json, f, ensure_ascii=False, indent=2)

print(f"Saved OCR + Table JSON to: {json_path}")
# NOTE(review): line numbering restarts here in the pasted diff, so this looks
# like the start of a separate module (presumably the `table_detector` module
# that the script above imports `detect_tables` from) - confirm.
import cv2
import numpy as np
import os


def _cluster_positions(values, tol):
    """Merge nearby 1-D positions, keeping the smallest value of each group.

    Values are visited in ascending order; a value opens a new group when it
    lies more than ``tol`` away from the representative already kept for the
    current group.
    """
    clustered = []
    for value in sorted(values):
        if not clustered or value - clustered[-1] > tol:
            clustered.append(value)
    return clustered


def detect_tables(image_path, *, save_debug=True):
    """Detect a ruled table in an image from its long straight lines.

    Long, nearly horizontal Hough segments are clustered into row
    separators and long, nearly vertical ones into column separators. When
    at least three row separators are found, a single table is reported that
    spans from the first to the last separator.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        save_debug: When true (the default), draw the detected rows and table
            outline on the image and write it next to the input as
            ``<name>_debug.jpg``. Nothing is written when no table is found.

    Returns:
        A list with zero or one dict. The dict has the keys ``total_rows``,
        ``total_cols``, ``table_box`` (``(x_min, y_min, x_max, y_max)``) and
        ``rows_box`` (a list of ``{"row": (x_min, y_top, x_max, y_bottom)}``).
        ``total_cols`` is counted from vertical lines found anywhere in the
        image; it is not restricted to the table box.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Không đọc được ảnh: {image_path}")

    height, width = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)

    # Edge detection
    edges = cv2.Canny(blur, 50, 150, apertureSize=3)

    axis_tol = 3      # max endpoint offset (px) for a segment to count as axis-aligned
    cluster_tol = 10  # separators closer than this (px) are treated as one line

    # --- Horizontal lines: segments spanning at least 60% of the image width ---
    lines_h = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=120,
                              minLineLength=int(width * 0.6), maxLineGap=20)
    ys_candidates, line_segments = [], []
    if lines_h is not None:
        for line in lines_h:
            # Convert numpy scalars to built-in ints once, at the source.
            x1, y1, x2, y2 = (int(v) for v in line[0])
            if abs(y1 - y2) <= axis_tol:  # horizontal
                y_mid = int(round((y1 + y2) / 2))
                ys_candidates.append(y_mid)
                # Endpoint order is not assumed: store (left, right, y).
                line_segments.append((min(x1, x2), max(x1, x2), y_mid))

    # Group nearby y positions into row separators
    ys = _cluster_positions(ys_candidates, cluster_tol)
    total_rows = max(0, len(ys) - 1)

    # --- Vertical lines: segments spanning at least 50% of the image height ---
    lines_v = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100,
                              minLineLength=int(height * 0.5), maxLineGap=20)
    xs = []
    if lines_v is not None:
        for line in lines_v:
            x1, y1, x2, y2 = (int(v) for v in line[0])
            if abs(x1 - x2) <= axis_tol:
                xs.append(int(round((x1 + x2) / 2)))

    # Group nearby x positions into column separators
    x_pos = _cluster_positions(xs, cluster_tol)
    total_cols = max(0, len(x_pos) - 1)

    # A table needs at least three row separators (i.e. two rows).
    if len(ys) < 3 or not line_segments:
        return []

    y_min, y_max = ys[0], ys[-1]
    min_x = min(seg[0] for seg in line_segments)
    max_x = max(seg[1] for seg in line_segments)
    table_box = (min_x, y_min, max_x, y_max)

    # Each pair of consecutive separators bounds one row.
    row_boxes = [(min_x, top, max_x, bottom) for top, bottom in zip(ys, ys[1:])]

    tables = [{
        "total_rows": total_rows,
        "total_cols": total_cols,
        "table_box": table_box,
        "rows_box": [{"row": box} for box in row_boxes],
    }]

    if save_debug:
        # Rows first, then the table outline on top.
        for left, top, right, bottom in row_boxes:
            cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 255), 2)
        cv2.rectangle(img, (min_x, y_min), (max_x, y_max), (255, 0, 0), 3)

        debug_path = os.path.splitext(image_path)[0] + "_debug.jpg"
        cv2.imwrite(debug_path, img)

    return tables