Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Satini_pvduc
/
ocrpdf
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Authored by
tien_nemo
2025-09-16 09:42:26 +0700
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Commit
ef142bf18d76262349adc984f42bb69a188ce9fb
ef142bf1
1 parent
57efd5e3
fix det table
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
6 deletions
app/Services/OCR/extrac_table.py
app/Services/OCR/extrac_table.py
View file @
ef142bf
...
...
@@ -14,7 +14,7 @@ BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "
PDF_NAME
=
'nemo_new'
# PDF path
pdf_path
=
Path
(
BASE_DIR
)
/
"storage"
/
"pdf"
/
"
2
.pdf"
pdf_path
=
Path
(
BASE_DIR
)
/
"storage"
/
"pdf"
/
"
Iwasaki_1
.pdf"
# Output folder
output_folder
=
Path
(
BASE_DIR
)
/
"public"
/
"image"
...
...
@@ -81,9 +81,9 @@ def bolden_text(rgb_img: np.ndarray,
threshold
=
0.02
text_ratio
=
estimate_text_ratio
(
gray
,
block_size
=
256
)
print
(
f
"text_ratio={text_ratio:.3f} -> {'Mảnh' if text_ratio < threshold else 'Đậm'}"
)
debug_path
=
os
.
path
.
join
(
output_folder
,
f
"{img_base_name}_preprocessed_debug.jpg"
)
if
text_ratio
>
threshold
:
return
rgb_img
return
rgb_img
,
debug_path
# Dilation
text_mask
=
255
-
binary
...
...
@@ -104,12 +104,12 @@ def bolden_text(rgb_img: np.ndarray,
# Trả về RGB cho PaddleOCR
out_rgb
=
cv2
.
cvtColor
(
out_gray
,
cv2
.
COLOR_GRAY2RGB
)
debug_path
=
os
.
path
.
join
(
output_folder
,
f
"{img_base_name}_preprocessed_debug.jpg"
)
cv2
.
imwrite
(
debug_path
,
cv2
.
cvtColor
(
out_rgb
,
cv2
.
COLOR_RGB2BGR
))
print
(
f
"[DEBUG] Preprocessed image saved to: {debug_path}"
)
return
out_rgb
return
out_rgb
,
debug_path
preprocessed
=
bolden_text
(
preprocessed
,
debug_file
=
bolden_text
(
image_np
,
kernel_size
=
3
,
# tăng lên 3 nếu chữ vẫn mảnh
iterations
=
1
,
# tăng lên 2 nếu muốn đậm hơn
...
...
@@ -143,6 +143,8 @@ for res in results:
})
# ==== Detect table ====
if
debug_file
and
os
.
path
.
exists
(
debug_file
):
image_path
=
debug_file
table_info
=
detect_tables
(
image_path
)
for
table
in
table_info
:
...
...
Please
register
or
sign in
to post a comment