Showing 1 changed file with 18 additions and 17 deletions
| ... | @@ -141,20 +141,7 @@ for res in results: | ... | @@ -141,20 +141,7 @@ for res in results: |
| 141 | "field": "", | 141 | "field": "", |
| 142 | "hideBorder": False | 142 | "hideBorder": False |
| 143 | }) | 143 | }) |
| 144 | - | 144 | +def overlap_ratio(bbox, cell_box): |
| 145 | -# ==== Detect table ==== | ||
| 146 | -if debug_file and os.path.exists(debug_file): | ||
| 147 | - image_path = debug_file | ||
| 148 | -table_info = detect_tables(image_path) | ||
| 149 | - | ||
| 150 | -for table in table_info: | ||
| 151 | - for row in table["cells"]: # row là list các cell dict | ||
| 152 | - for cell in row: | ||
| 153 | - x1, y1, x2, y2 = cell["cell"] | ||
| 154 | - cell_texts = [] | ||
| 155 | - | ||
| 156 | - # Helper: compute overlap ratio of bbox against cell | ||
| 157 | - def overlap_ratio(bbox, cell_box): | ||
| 158 | ix1 = max(bbox[0], cell_box[0]) | 145 | ix1 = max(bbox[0], cell_box[0]) |
| 159 | iy1 = max(bbox[1], cell_box[1]) | 146 | iy1 = max(bbox[1], cell_box[1]) |
| 160 | ix2 = min(bbox[2], cell_box[2]) | 147 | ix2 = min(bbox[2], cell_box[2]) |
| ... | @@ -164,14 +151,21 @@ for table in table_info: | ... | @@ -164,14 +151,21 @@ for table in table_info: |
| 164 | inter = iw * ih | 151 | inter = iw * ih |
| 165 | bbox_area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) | 152 | bbox_area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) |
| 166 | return inter / float(bbox_area) | 153 | return inter / float(bbox_area) |
| 167 | - | 154 | +def center_inside(bbox, cell_box): |
| 168 | - # Helper: check center inside cell | ||
| 169 | - def center_inside(bbox, cell_box): | ||
| 170 | cx = (bbox[0] + bbox[2]) / 2.0 | 155 | cx = (bbox[0] + bbox[2]) / 2.0 |
| 171 | cy = (bbox[1] + bbox[3]) / 2.0 | 156 | cy = (bbox[1] + bbox[3]) / 2.0 |
| 172 | return (cx >= cell_box[0] and cx <= cell_box[2] and | 157 | return (cx >= cell_box[0] and cx <= cell_box[2] and |
| 173 | cy >= cell_box[1] and cy <= cell_box[3]) | 158 | cy >= cell_box[1] and cy <= cell_box[3]) |
| 159 | +# ==== Detect table ==== | ||
| 160 | +if debug_file and os.path.exists(debug_file): | ||
| 161 | + image_path = debug_file | ||
| 162 | +table_info = detect_tables(image_path) | ||
| 174 | 163 | ||
| 164 | +for index, table in enumerate(table_info): | ||
| 165 | + for row in table["cells"]: # row là list các cell dict | ||
| 166 | + for cell in row: | ||
| 167 | + x1, y1, x2, y2 = cell["cell"] | ||
| 168 | + cell_texts = [] | ||
| 175 | cell_box = [x1, y1, x2, y2] | 169 | cell_box = [x1, y1, x2, y2] |
| 176 | for item in ocr_data_list: | 170 | for item in ocr_data_list: |
| 177 | bx1, by1, bx2, by2 = item["bbox"] | 171 | bx1, by1, bx2, by2 = item["bbox"] |
| ... | @@ -179,6 +173,13 @@ for table in table_info: | ... | @@ -179,6 +173,13 @@ for table in table_info: |
| 179 | # Accept if bbox is largely inside the cell, or its center lies inside the cell | 173 | # Accept if bbox is largely inside the cell, or its center lies inside the cell |
| 180 | if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box): | 174 | if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box): |
| 181 | cell_texts.append(item["text"]) | 175 | cell_texts.append(item["text"]) |
| 176 | + item["table"] = { | ||
| 177 | + "bbox": { | ||
| 178 | + "table_index": index, | ||
| 179 | + "row_idx": cell["row_idx"], | ||
| 180 | + "col_idx": cell["col_idx"] | ||
| 181 | + } | ||
| 182 | + } | ||
| 182 | 183 | ||
| 183 | # thêm vào cell gốc | 184 | # thêm vào cell gốc |
| 184 | cell["texts"] = cell_texts | 185 | cell["texts"] = cell_texts | ... | ... |
-
Please register or sign in to post a comment