Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Satini_pvduc
/
ocrpdf
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Authored by
tien_nemo
2025-09-23 08:14:21 +0700
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Commit
644aced63246d95059c7f6ed370d636717d57df6
644aced6
1 parent
ef142bf1
fix anh dung keu 22/09
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
22 deletions
app/Services/OCR/extrac_table.py
app/Services/OCR/extrac_table.py
View file @
644aced
...
...
@@ -141,37 +141,31 @@ for res in results:
"field"
:
""
,
"hideBorder"
:
False
})
def overlap_ratio(bbox, cell_box):
    """Return the share of ``bbox``'s area that falls inside ``cell_box``.

    Both arguments are indexed as ``[x1, y1, x2, y2]``. The area of ``bbox``
    is clamped to a minimum of 1 before dividing, so a zero-area box yields
    0.0 rather than raising ``ZeroDivisionError``.
    """
    # Denominator: area of the OCR box itself (not the union of both boxes).
    box_width = bbox[2] - bbox[0]
    box_height = bbox[3] - bbox[1]
    denominator = float(max(1, box_width * box_height))

    # Extent of the intersection rectangle; negative extents mean no overlap.
    overlap_w = max(0, min(bbox[2], cell_box[2]) - max(bbox[0], cell_box[0]))
    overlap_h = max(0, min(bbox[3], cell_box[3]) - max(bbox[1], cell_box[1]))

    return (overlap_w * overlap_h) / denominator
def center_inside(bbox, cell_box):
    """Tell whether the midpoint of ``bbox`` lies within ``cell_box``.

    Both arguments are indexed as ``[x1, y1, x2, y2]``. Points exactly on the
    cell border count as inside.
    """
    mid_x = (bbox[0] + bbox[2]) / 2.0
    mid_y = (bbox[1] + bbox[3]) / 2.0
    return (
        cell_box[0] <= mid_x <= cell_box[2]
        and cell_box[1] <= mid_y <= cell_box[3]
    )
# ==== Detect table ====
if
debug_file
and
os
.
path
.
exists
(
debug_file
):
image_path
=
debug_file
table_info
=
detect_tables
(
image_path
)
for
table
in
table_info
:
for
index
,
table
in
enumerate
(
table_info
)
:
for
row
in
table
[
"cells"
]:
# row is a list of cell dicts
for
cell
in
row
:
x1
,
y1
,
x2
,
y2
=
cell
[
"cell"
]
cell_texts
=
[]
# Helper: compute overlap ratio of bbox against cell
def overlap_ratio(bbox, cell_box):
    """Compute intersection area divided by the area of ``bbox``.

    ``bbox`` and ``cell_box`` are read by index as ``[x1, y1, x2, y2]``.
    The ``bbox`` area is floored at 1 to keep the division safe.
    """
    # Corners of the intersection rectangle.
    left = max(bbox[0], cell_box[0])
    top = max(bbox[1], cell_box[1])
    right = min(bbox[2], cell_box[2])
    bottom = min(bbox[3], cell_box[3])

    # Clamp to zero so disjoint boxes contribute no area.
    shared_area = max(0, right - left) * max(0, bottom - top)

    own_area = max(1, (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))
    return shared_area / float(own_area)
# Helper: check center inside cell
def center_inside(bbox, cell_box):
    """Return True when the center point of ``bbox`` is within ``cell_box``.

    Boxes are read by index as ``[x1, y1, x2, y2]``; the cell edges are
    inclusive.
    """
    center_x = (bbox[0] + bbox[2]) / 2.0
    center_y = (bbox[1] + bbox[3]) / 2.0

    # Reject as soon as the horizontal test fails, then decide on the vertical.
    if not (center_x >= cell_box[0] and center_x <= cell_box[2]):
        return False
    return center_y >= cell_box[1] and center_y <= cell_box[3]
cell_box
=
[
x1
,
y1
,
x2
,
y2
]
for
item
in
ocr_data_list
:
bx1
,
by1
,
bx2
,
by2
=
item
[
"bbox"
]
...
...
@@ -179,6 +173,13 @@ for table in table_info:
# Accept if bbox is largely inside the cell, or its center lies inside the cell
if
overlap_ratio
(
bbox
,
cell_box
)
>=
0.3
or
center_inside
(
bbox
,
cell_box
):
cell_texts
.
append
(
item
[
"text"
])
item
[
"table"
]
=
{
"bbox"
:
{
"table_index"
:
index
,
"row_idx"
:
cell
[
"row_idx"
],
"col_idx"
:
cell
[
"col_idx"
]
}
}
# attach the collected texts to the original cell
cell
[
"texts"
]
=
cell_texts
...
...
Please
register
or
sign in
to post a comment