Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Satini_pvduc
/
ocrpdf
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Authored by
tien_nemo
2025-08-28 11:27:37 +0700
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Commit
dc663513b1a4b35a4ac3d2cf2031a54bf7fa5e6c
dc663513
1 parent
cade3f2a
test1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
156 additions
and
0 deletions
app/Services/OCR/extrac_table.py
app/Services/OCR/table_detector.py
app/Services/OCR/extrac_table.py
0 → 100644
View file @
dc66351
from
paddleocr
import
PaddleOCR
from
pdf2image
import
convert_from_path
import
os
import
time
import
numpy
as
np
import
json
from
pathlib
import
Path
import
cv2
from
table_detector
import
detect_tables
# ==== Config ====
# Project root: three directory levels above the folder that holds this file.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..", "..")
)

# Prefix used for every file this script writes.
# NOTE(review): this is hard-coded and does not follow the input file name;
# deriving it from ``pdf_path.stem`` was tried and left disabled — confirm
# which behaviour is wanted.
PDF_NAME = 'aaaa'

# PDF path
pdf_path = Path(BASE_DIR, "storage", "pdf", "fax.pdf")

# Output folder (created on demand, reused when it already exists)
output_folder = Path(BASE_DIR, "public", "image")
os.makedirs(output_folder, exist_ok=True)

# Whole-second Unix timestamp appended to the output file names.
timestamp = int(time.time())
img_base_name = f"{PDF_NAME}_{timestamp}"

# ==== OCR Init ====
# The three optional PaddleOCR switches below are all turned off.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)

# ==== PDF to Image ====
# Only page 1 of the PDF is rendered; it is saved as a JPEG so that the
# table detector can later read it back from disk.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
image_path = str(output_folder / f"{img_base_name}.jpg")
pages[0].save(image_path, "JPEG")

# ==== Run OCR ====
# OCR runs on the in-memory page converted to a NumPy array, not on the JPEG.
image_np = np.array(pages[0])
results = ocr.predict(image_np)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned bounding box of a polygon.

    Args:
        poly: Sequence of points; index 0 of each point is read as x and
            index 1 as y. Must contain at least one point.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` with every value truncated to
        ``int``.
    """
    x_coords = [point[0] for point in poly]
    y_coords = [point[1] for point in poly]

    left, top = min(x_coords), min(y_coords)
    right, bottom = max(x_coords), max(y_coords)

    # int() truncates toward zero, so fractional coordinates are not rounded.
    return [int(edge) for edge in (left, top, right, bottom)]
# ==== Build ocrData ====
# One record per recognised text line: the text itself, its bounding box,
# and two fields ("field", "hideBorder") that are emitted with fixed
# placeholder values.
ocr_data_list = [
    {
        "text": text,
        "bbox": poly_to_bbox(poly),
        "field": "",
        "hideBorder": False,
    }
    for res in results
    for text, poly in zip(res['rec_texts'], res['rec_polys'])
]

# ==== Detect table ====
# Runs on the JPEG written earlier, not on the in-memory page.
table_info = detect_tables(image_path)

# ==== Build JSON ====
final_json = {
    "ocr_data": ocr_data_list,
    "tables": table_info,
}

# ==== Save JSON ====
# ensure_ascii=False keeps non-ASCII text readable in the output file.
json_path = os.path.join(
    output_folder, f"{PDF_NAME}_{timestamp}_with_table.json"
)
with open(json_path, "w", encoding="utf-8") as out_file:
    json.dump(final_json, out_file, ensure_ascii=False, indent=2)

print(f"Saved OCR + Table JSON to: {json_path}")
app/Services/OCR/table_detector.py
0 → 100644
View file @
dc66351
import
cv2
import
numpy
as
np
import
os
def _cluster_positions(values, tol):
    """Return sorted *values* keeping only those more than *tol* past the last kept one."""
    kept = []
    for value in sorted(values):
        if not kept or value - kept[-1] > tol:
            kept.append(value)
    return kept


def detect_tables(image_path, *, save_debug=True):
    """Detect a ruled table in an image from its long straight lines.

    The image is converted to grayscale, blurred, and passed through Canny
    edge detection. Probabilistic Hough transforms then look for long
    near-horizontal segments (at least 60% of the image width) and long
    near-vertical segments (at least 50% of the image height). Positions
    closer than 10 px to an already kept one are dropped, and the remaining
    horizontal positions become the row boundaries.

    Args:
        image_path: Location of the image to analyse (``str`` or
            ``os.PathLike``).
        save_debug: When true (the default), an annotated copy of the image
            is written next to the input as ``<name>_debug.jpg``, whether or
            not a table was found.

    Returns:
        A list with at most one dict:

        * ``"total_rows"``: number of gaps between kept horizontal lines.
        * ``"total_cols"``: number of gaps between kept vertical lines
          (taken from the whole image, not restricted to the table box).
        * ``"table_box"``: ``(x_min, y_min, x_max, y_max)``.
        * ``"rows_box"``: list of ``{"row": (x_min, y_top, x_max, y_bottom)}``.

        The list is empty when fewer than three distinct horizontal lines
        are found.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    # cv2.imread historically requires a plain string path.
    path = os.fspath(image_path)
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f"Không đọc được ảnh: {image_path}")

    height, width = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)

    # Edge detection
    edges = cv2.Canny(blur, 50, 150, apertureSize=3)

    # --- Horizontal lines ---
    lines_h = cv2.HoughLinesP(
        edges,
        1,
        np.pi / 180,
        threshold=120,
        minLineLength=int(width * 0.6),
        maxLineGap=20,
    )
    ys_candidates, line_segments = [], []
    if lines_h is not None:
        for line in lines_h:
            # Plain Python ints keep later arithmetic and drawing calls
            # independent of NumPy scalar types.
            x1, y1, x2, y2 = (int(v) for v in line[0])
            if abs(y1 - y2) <= 3:  # horizontal
                y_mid = int(round((y1 + y2) / 2))
                ys_candidates.append(y_mid)
                # Endpoint order is not guaranteed, so store (left, right).
                line_segments.append((min(x1, x2), max(x1, x2), y_mid))

    # Group nearby y positions into single row boundaries.
    ys = _cluster_positions(ys_candidates, 10)
    total_rows = max(0, len(ys) - 1)

    # --- Vertical lines ---
    lines_v = cv2.HoughLinesP(
        edges,
        1,
        np.pi / 180,
        threshold=100,
        minLineLength=int(height * 0.5),
        maxLineGap=20,
    )
    xs = []
    if lines_v is not None:
        for line in lines_v:
            x1, _, x2, _ = (int(v) for v in line[0])
            if abs(x1 - x2) <= 3:
                xs.append(int(round((x1 + x2) / 2)))

    # Group nearby x positions into single column boundaries.
    x_pos = _cluster_positions(xs, 10)
    total_cols = max(0, len(x_pos) - 1)

    tables = []
    # At least three horizontal lines (two rows) are required for a table.
    if len(ys) >= 3 and line_segments:
        y_min, y_max = ys[0], ys[-1]
        min_x = min(seg[0] for seg in line_segments)
        max_x = max(seg[1] for seg in line_segments)
        table_box = (min_x, y_min, max_x, y_max)

        rows = []
        for top, bottom in zip(ys, ys[1:]):
            rows.append({"row": (min_x, top, max_x, bottom)})
            if save_debug:
                cv2.rectangle(
                    img, (min_x, top), (max_x, bottom), (0, 255, 255), 2
                )

        tables.append({
            "total_rows": total_rows,
            "total_cols": total_cols,
            "table_box": table_box,
            "rows_box": rows,
        })
        if save_debug:
            cv2.rectangle(
                img, (min_x, y_min), (max_x, y_max), (255, 0, 0), 3
            )

    if save_debug:
        debug_path = os.path.splitext(path)[0] + "_debug.jpg"
        cv2.imwrite(debug_path, img)

    return tables
Please
register
or
sign in
to post a comment