Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Satini_pvduc
/
ocrpdf
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Authored by
tien_nemo
2025-09-25 10:42:35 +0700
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Commit
0513e5e401b59142bd9594953c14e4e48733abba
0513e5e4
1 parent
644aced6
222
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
38 additions
and
15 deletions
app/Services/OCR/extrac_table.py
app/Services/OCR/extrac_table.py
View file @
0513e5e
...
...
@@ -161,27 +161,50 @@ if debug_file and os.path.exists(debug_file):
image_path
=
debug_file
table_info
=
detect_tables
(
image_path
)
for
index
,
table
in
enumerate
(
table_info
):
for
table_
index
,
table
in
enumerate
(
table_info
):
for
row
in
table
[
"cells"
]:
# row là list các cell dict
for
cell
in
row
:
x1
,
y1
,
x2
,
y2
=
cell
[
"cell"
]
cell_texts
=
[]
cell_box
=
[
x1
,
y1
,
x2
,
y2
]
for
item
in
ocr_data_list
:
bx1
,
by1
,
bx2
,
by2
=
item
[
"bbox"
]
bbox
=
[
bx1
,
by1
,
bx2
,
by2
]
# Accept if bbox is largely inside the cell, or its center lies inside the cell
if
overlap_ratio
(
bbox
,
cell_box
)
>=
0.3
or
center_inside
(
bbox
,
cell_box
):
cell_texts
.
append
(
item
[
"text"
])
item
[
"table"
]
=
{
"bbox"
:
{
"table_index"
:
index
,
"row_idx"
:
cell
[
"row_idx"
],
"col_idx"
:
cell
[
"col_idx"
]
# Lọc OCR nằm trong ô
items_in_cell
=
[
item
for
item
in
ocr_data_list
if
overlap_ratio
(
item
[
"bbox"
],
cell_box
)
>=
0.3
or
center_inside
(
item
[
"bbox"
],
cell_box
)
]
# Danh sách text trong cell
cell_texts
=
[]
if
items_in_cell
:
# Sắp xếp OCR theo y rồi x
items_in_cell
.
sort
(
key
=
lambda
it
:
(
it
[
"bbox"
][
1
],
it
[
"bbox"
][
0
]))
# Gom OCR xuống dòng dựa trên y
line_groups
=
[]
current_group
=
[
items_in_cell
[
0
]]
for
it
in
items_in_cell
[
1
:]:
if
abs
(
it
[
"bbox"
][
1
]
-
current_group
[
-
1
][
"bbox"
][
1
])
>
5
:
# threshold 5 px
line_groups
.
append
(
current_group
)
current_group
=
[
it
]
else
:
current_group
.
append
(
it
)
line_groups
.
append
(
current_group
)
# Gán col_idx hậu tố cho OCR trong mỗi dòng và lưu text
multi_line
=
len
(
line_groups
)
>
1
for
sub_i
,
group
in
enumerate
(
line_groups
,
start
=
1
):
for
it
in
group
:
cell_texts
.
append
(
it
[
"text"
])
it
[
"table"
]
=
{
"bbox"
:
{
"table_index"
:
table_index
,
"row_idx"
:
cell
[
"row_idx"
],
"col_idx"
:
f
"{cell['col_idx']}_{sub_i}"
if
multi_line
else
cell
[
"col_idx"
]
}
}
}
#
thêm vào cell gốc
#
Lưu text gộp trong cell
cell
[
"texts"
]
=
cell_texts
cell
[
"text"
]
=
" "
.
join
(
cell_texts
)
# ==== Build JSON ====
...
...
Please
register
or
sign in
to post a comment