# read_pdf.py - render the first page of a PDF, run PaddleOCR on it, and save the recognized text boxes as JSON.
import json
import math
import os
import time

import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
# ==== Config ====
# Input PDF and the folder that receives both the rendered page image and the OCR JSON.
pdf_path = "D:/Learning_Tien/OCR/PaddleOCR/pdf/data_picking_detail.pdf"
output_folder = "D:/Learning_Tien/OCR/ocr-mapping/public/image"
os.makedirs(output_folder, exist_ok=True)

# Derive the base name from pdf_path so the output names cannot drift from the
# input when the path above is changed.
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
# The Unix timestamp keeps output files from different runs apart.
timestamp = int(time.time())
img_base_name = f"{pdf_name}_{timestamp}"

# ==== OCR Init ====
# The three optional stages named by these flags are switched off.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)

# ==== PDF to Image ====
# Only the first page of the PDF is rendered.
pages = convert_from_path(pdf_path, first_page=1, last_page=1)
if not pages:
    # Fail with a clear message instead of an IndexError on pages[0].
    raise RuntimeError(f"No pages could be rendered from PDF: {pdf_path}")
first_page = pages[0]
image_path = os.path.join(output_folder, f"{img_base_name}.jpg")
first_page.save(image_path, "JPEG")

# ==== Run OCR ====
# The OCR call is fed the in-memory page as a NumPy array, not the saved JPEG.
image_np = np.array(first_page)
results = ocr.predict(image_np)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
    """Return the axis-aligned bounding box that encloses a polygon.

    Args:
        poly: Non-empty iterable of points. Each point is indexable, with
            x at index 0 and y at index 1 (e.g. a list of ``[x, y]`` pairs).

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as Python ints. Minima are rounded
        down and maxima are rounded up, so fractional coordinates never fall
        outside the returned box. Integer coordinates are returned unchanged.

    Raises:
        ValueError: If ``poly`` contains no points.
    """
    xs = [p[0] for p in poly]
    ys = [p[1] for p in poly]
    if not xs:
        raise ValueError("poly must contain at least one point")
    # int(...) guarantees plain Python ints (JSON-serializable) whatever
    # numeric type the coordinates arrive in.
    return [
        int(math.floor(min(xs))),
        int(math.floor(min(ys))),
        int(math.ceil(max(xs))),
        int(math.ceil(max(ys))),
    ]
# ==== Build ocrData ====
# One entry per recognized text line, across every result the OCR call returned.
# "field" and "hideBorder" are emitted with fixed defaults ("" and False).
ocr_data_list = []
for page_result in results:
    line_texts = page_result['rec_texts']
    line_polys = page_result['rec_polys']
    ocr_data_list.extend(
        {
            "text": line_text,
            "bbox": poly_to_bbox(line_poly),
            "field": "",
            "hideBorder": False,
        }
        for line_text, line_poly in zip(line_texts, line_polys)
    )

# ==== Save JSON ====
# The JSON file shares the "<pdf_name>_<timestamp>" base name used for this run.
json_path = os.path.join(output_folder, f"{pdf_name}_{timestamp}.json")
with open(json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    f.write(json.dumps(ocr_data_list, ensure_ascii=False, indent=2))
print(f"Saved OCR data JSON to: {json_path}")