222

tien_nemo
Commit 0513e5e401b59142bd9594953c14e4e48733abba 0513e5e4 1 parent 644aced6
Showing 1 changed file with 35 additions and 12 deletions
app/Services/OCR/extrac_table.py
--- a/app/Services/OCR/extrac_table.py
View file @0513e5e
+++ b/app/Services/OCR/extrac_table.py
View file @0513e5e
@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file):
     image_path = debug_file
 table_info = detect_tables(image_path)
-for index, table in enumerate(table_info):
+for table_index, table in enumerate(table_info):
     for row in table["cells"]:  # each row is a list of cell dicts
         for cell in row:
             x1, y1, x2, y2 = cell["cell"]
-            cell_texts = []
             cell_box = [x1, y1, x2, y2]
-            for item in ocr_data_list:
+
-                bx1, by1, bx2, by2 = item["bbox"]
+            # Keep only the OCR items that fall inside this cell
-                bbox = [bx1, by1, bx2, by2]
+            items_in_cell = [
-                # Accept if bbox is largely inside the cell, or its center lies inside the cell
+                item for item in ocr_data_list
-                if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box):
+                if overlap_ratio(item["bbox"], cell_box) >= 0.3 or center_inside(item["bbox"], cell_box)
-                    cell_texts.append(item["text"])
+            ]
-                    item["table"] = {
+
+            # Texts collected for this cell
+            cell_texts = []
+
+            if items_in_cell:
+                # Sort OCR items by y first, then by x
+                items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))
+
+                # Group OCR items into text lines based on their y coordinate
+                line_groups = []
+                current_group = [items_in_cell[0]]
+                for it in items_in_cell[1:]:
+                    if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5:  # threshold 5 px: a larger jump in y starts a new line group
+                        line_groups.append(current_group)
+                        current_group = [it]
+                    else:
+                        current_group.append(it)
+                line_groups.append(current_group)
+
+                # Tag each OCR item with col_idx (suffixed with the 1-based line number when the cell has several lines) and collect its text
+                multi_line = len(line_groups) > 1
+                for sub_i, group in enumerate(line_groups, start=1):
+                    for it in group:
+                        cell_texts.append(it["text"])
+                        it["table"] = {
                             "bbox": {
-                            "table_index": index,
+                                "table_index": table_index,
                                 "row_idx": cell["row_idx"],
-                            "col_idx": cell["col_idx"]
+                                "col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
                             }
                         }
-            # add to the original cell
+            # Store the collected texts and their joined form on the cell
             cell["texts"] = cell_texts
             cell["text"] = " ".join(cell_texts)
 # ==== Build JSON ====