tien_nemo

222

......@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file):
# NOTE(review): these lines follow a unified-diff hunk header and appear to
# interleave removed and added lines with the indentation stripped (several
# statements occur in both an older and a newer form) — confirm against the
# real file before relying on the nesting or on which variant is live.
# Purpose, as far as the visible lines show: detect tables in the image, then
# for each cell collect the OCR items that lie in it, group them into text
# lines, tag every OCR item with its table/row/column position, and store the
# collected texts on the cell.
image_path = debug_file
table_info = detect_tables(image_path)
# NOTE(review): the next two loop headers differ only in the index name
# (`index` vs `table_index`); presumably the first is the pre-change line.
for index, table in enumerate(table_info):
for table_index, table in enumerate(table_info):
for row in table["cells"]: # row is a list of cell dicts
for cell in row:
# cell["cell"] is unpacked as the four coordinates of the cell box.
x1, y1, x2, y2 = cell["cell"]
cell_texts = []
cell_box = [x1, y1, x2, y2]
for item in ocr_data_list:
bx1, by1, bx2, by2 = item["bbox"]
bbox = [bx1, by1, bx2, by2]
# Accept if bbox is largely inside the cell, or its center lies inside the cell
if overlap_ratio(bbox, cell_box) >= 0.3 or center_inside(bbox, cell_box):
cell_texts.append(item["text"])
# NOTE(review): this dict literal is never closed here; it looks like the
# pre-change counterpart of the `it["table"] = {` assignment further down.
item["table"] = {
# Keep only the OCR items that fall inside this cell
items_in_cell = [
item for item in ocr_data_list
if overlap_ratio(item["bbox"], cell_box) >= 0.3 or center_inside(item["bbox"], cell_box)
]
# List of texts in the cell
cell_texts = []
if items_in_cell:
# Sort the OCR items by y, then by x (bbox[1], then bbox[0])
items_in_cell.sort(key=lambda it: (it["bbox"][1], it["bbox"][0]))
# Group the OCR items into lines based on y
line_groups = []
current_group = [items_in_cell[0]]
for it in items_in_cell[1:]:
# A new group starts when this item's y differs from the previous item's y
# by more than the threshold; the comparison is against the last item added,
# not against the first item of the group.
if abs(it["bbox"][1] - current_group[-1]["bbox"][1]) > 5: # threshold 5 px
line_groups.append(current_group)
current_group = [it]
else:
current_group.append(it)
# Flush the final group, which the loop above never appends.
line_groups.append(current_group)
# Assign a suffixed col_idx to the OCR items of each line and store their text
multi_line = len(line_groups) > 1
# sub_i is the 1-based position of the line group within the cell.
for sub_i, group in enumerate(line_groups, start=1):
for it in group:
cell_texts.append(it["text"])
it["table"] = {
"bbox": {
# NOTE(review): "table_index" and "col_idx" each appear twice below; the
# first of each pair presumably is the pre-change line — confirm.
"table_index": index,
"table_index": table_index,
"row_idx": cell["row_idx"],
"col_idx": cell["col_idx"]
# With more than one line group the value becomes the string
# "<col_idx>_<sub_i>"; otherwise it stays the cell's own col_idx value.
"col_idx": f"{cell['col_idx']}_{sub_i}" if multi_line else cell["col_idx"]
}
}
# add to the original cell
# Store the merged text in the cell
cell["texts"] = cell_texts
# The single-string form joins all collected texts with one space.
cell["text"] = " ".join(cell_texts)
# ==== Build JSON ====
......