Showing 1 changed file with 35 additions and 12 deletions
| ... | @@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file): | ... | @@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file): |
| 161 | image_path = debug_file | 161 | image_path = debug_file |
| 162 | table_info = detect_tables(image_path) | 162 | table_info = detect_tables(image_path) |
| 163 | 163 | ||
| 164 | -for index, table in enumerate(table_info): | 164 | +for table_index, table in enumerate(table_info): |
| 165 | for row in table["cells"]: # row là list các cell dict | 165 | for row in table["cells"]: # row là list các cell dict |
| 166 | for cell in row: | 166 | for cell in row: |
| 167 | x1, y1, x2, y2 = cell["cell"] | 167 | x1, y1, x2, y2 = cell["cell"] |
| 168 | - cell_texts = [] | ||
| 169 | cell_box = [x1, y1, x2, y2] | 168 | cell_box = [x1, y1, x2, y2] |
| 170 | - for item in ocr_data_list: | 169 | + |
| 171 | - bx1, by1, bx2, by2 = item["bbox"] | 170 | + # Lọc OCR nằm trong ô |
| 172 | - bbox = [bx1, by1, bx2, by2] | 171 | + items_in_cell = [ |
| 173 | - # Accept if bbox is largely inside the cell, or its center lies inside the cell | 172 | + item for item in ocr_data_list |
| 174 | - if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box): | 173 | + if overlap_ratio(item["bbox"], cell_box) >= 0.3 or center_inside(item["bbox"], cell_box) |
| 175 | - cell_texts.append(item["text"]) | 174 | + ] |
| 176 | - item["table"] = { | 175 | + |
| 176 | + # Danh sách text trong cell | ||
| 177 | + cell_texts = [] | ||
| 178 | + | ||
| 179 | + if items_in_cell: | ||
| 180 | + # Sắp xếp OCR theo y rồi x | ||
| 181 | + items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0])) | ||
| 182 | + | ||
| 183 | + # Gom OCR xuống dòng dựa trên y | ||
| 184 | + line_groups = [] | ||
| 185 | + current_group = [items_in_cell[0]] | ||
| 186 | + for it in items_in_cell[1:]: | ||
| 187 | + if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5: # threshold 5 px | ||
| 188 | + line_groups.append(current_group) | ||
| 189 | + current_group = [it] | ||
| 190 | + else: | ||
| 191 | + current_group.append(it) | ||
| 192 | + line_groups.append(current_group) | ||
| 193 | + | ||
| 194 | + # Gán col_idx hậu tố cho OCR trong mỗi dòng và lưu text | ||
| 195 | + multi_line = len(line_groups) > 1 | ||
| 196 | + for sub_i, group in enumerate(line_groups, start=1): | ||
| 197 | + for it in group: | ||
| 198 | + cell_texts.append(it["text"]) | ||
| 199 | + it["table"] = { | ||
| 177 | "bbox": { | 200 | "bbox": { |
| 178 | - "table_index": index, | 201 | + "table_index": table_index, |
| 179 | "row_idx": cell["row_idx"], | 202 | "row_idx": cell["row_idx"], |
| 180 | - "col_idx": cell["col_idx"] | 203 | + "col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"] |
| 181 | } | 204 | } |
| 182 | } | 205 | } |
| 183 | 206 | ||
| 184 | - # thêm vào cell gốc | 207 | + # Lưu text gộp trong cell |
| 185 | cell["texts"] = cell_texts | 208 | cell["texts"] = cell_texts |
| 186 | cell["text"] = " ".join(cell_texts) | 209 | cell["text"] = " ".join(cell_texts) |
| 187 | # ==== Build JSON ==== | 210 | # ==== Build JSON ==== | ... | ... |
-
Please register or sign in to post a comment