tien_nemo

222

...@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file): ...@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file):
161 image_path = debug_file 161 image_path = debug_file
162 table_info = detect_tables(image_path) 162 table_info = detect_tables(image_path)
163 163
# Attach each OCR item to the table cell that contains it, and record the
# cell's text. When the OCR items inside one cell span several visual lines,
# every item is tagged with a per-line suffix on its col_idx
# ("<col_idx>_<n>", n starting at 1).

# Largest difference in bbox[1] between two consecutive items (after sorting)
# for them to still be treated as part of the same visual line.
LINE_Y_THRESHOLD_PX = 5

for table_index, table in enumerate(table_info):
    for row in table["cells"]:  # row is a list of cell dicts
        for cell in row:
            x1, y1, x2, y2 = cell["cell"]
            cell_box = [x1, y1, x2, y2]

            # Keep the OCR items that belong to this cell: accept an item if
            # its bbox is largely inside the cell, or its center lies inside.
            items_in_cell = [
                item for item in ocr_data_list
                if overlap_ratio(item["bbox"], cell_box) >= 0.3
                or center_inside(item["bbox"], cell_box)
            ]

            # Texts collected for this cell, in reading order.
            cell_texts = []

            if items_in_cell:
                # Sort by y first (then x) so that consecutive items can be
                # clustered into lines below.
                items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))

                # Split the sorted items into visual lines: a jump in y larger
                # than the threshold starts a new line.
                # NOTE(review): each item is compared with the previous item,
                # not with the first item of the line, so a slow drift in y
                # can keep extending one line — confirm this is intended.
                line_groups = []
                current_group = [items_in_cell[0]]
                for it in items_in_cell[1:]:
                    if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > LINE_Y_THRESHOLD_PX:
                        line_groups.append(current_group)
                        current_group = [it]
                    else:
                        current_group.append(it)
                line_groups.append(current_group)

                # Tag every OCR item with its table position and collect text.
                # NOTE(review): col_idx is a suffixed string for multi-line
                # cells and the raw cell value otherwise; consumers must
                # handle both forms.
                multi_line = len(line_groups) > 1
                for sub_i, group in enumerate(line_groups, start=1):
                    col_idx = f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
                    # Items of one line may differ in y by a few pixels, so the
                    # (y, x) sort above does not guarantee left-to-right order
                    # inside the line; order by x here before emitting text.
                    for it in sorted(group, key=lambda o: o["bbox"][0]):
                        cell_texts.append(it["text"])
                        it["table"] = {
                            "bbox": {
                                "table_index": table_index,
                                "row_idx": cell["row_idx"],
                                "col_idx": col_idx,
                            }
                        }

            # Store both the individual texts and the joined text on the cell.
            cell["texts"] = cell_texts
            cell["text"] = " ".join(cell_texts)
187 # ==== Build JSON ==== 210 # ==== Build JSON ====
......