222

tien_nemo
Commit 0513e5e401b59142bd9594953c14e4e48733abba 0513e5e4 1 parent 644aced6
Showing 1 changed file with 35 additions and 12 deletions
app/Services/OCR/extrac_table.py
--- a/app/Services/OCR/extrac_table.py
View file @0513e5e
+++ b/app/Services/OCR/extrac_table.py
View file @0513e5e
@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file):
     image_path = debug_file
 table_info = detect_tables(image_path)
 
- for index, table in enumerate(table_info):
+ for table_index, table in enumerate(table_info):
     for row in table["cells"]:  # row là list các cell dict
         for cell in row:
             x1, y1, x2, y2 = cell["cell"]
-             cell_texts = []
             cell_box = [x1, y1, x2, y2]
-             for item in ocr_data_list:
-                 bx1, by1, bx2, by2 = item["bbox"]
-                 bbox = [bx1, by1, bx2, by2]
-                 # Accept if bbox is largely inside the cell, or its center lies inside the cell
-                 if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box):
-                     cell_texts.append(item["text"])
-                     item["table"] = {
+ 
+             # Keep only the OCR items that fall inside this cell
+             items_in_cell = [
+                 item for item in ocr_data_list
+                 if overlap_ratio(item["bbox"], cell_box) >= 0.3 or center_inside(item["bbox"], cell_box)
+             ]
+ 
+             # Texts collected for this cell
+             cell_texts = []
+ 
+             if items_in_cell:
+                 # Sort the OCR items by y, then by x
+                 items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))
+ 
+                 # Group the OCR items into lines: a new line starts when y jumps by more than the threshold
+                 line_groups = []
+                 current_group = [items_in_cell[0]]
+                 for it in items_in_cell[1:]:
+                     if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5:  # threshold 5 px
+                         line_groups.append(current_group)
+                         current_group = [it]
+                     else:
+                         current_group.append(it)
+                 line_groups.append(current_group)
+ 
+                 # Tag each OCR item with its cell position (col_idx gets a "_<line>" suffix only for multi-line cells) and collect its text
+                 multi_line = len(line_groups) > 1
+                 for sub_i, group in enumerate(line_groups, start=1):
+                     for it in group:
+                         cell_texts.append(it["text"])
+                         it["table"] = {
                             "bbox": {
-                             "table_index": index,
+                                 "table_index": table_index,
                                 "row_idx": cell["row_idx"],
-                             "col_idx": cell["col_idx"]
+                                 "col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
                             }
                         }
 
-             # thêm vào cell gốc
+             # Store the collected texts and their joined string on the cell
             cell["texts"] = cell_texts
             cell["text"] = " ".join(cell_texts)
 # ==== Build JSON ====