tien_nemo

nếu chữ mảnh tăng đậm hơn

...@@ -7,17 +7,18 @@ import json ...@@ -7,17 +7,18 @@ import json
7 from pathlib import Path 7 from pathlib import Path
8 import cv2 8 import cv2
9 from table_detector import detect_tables 9 from table_detector import detect_tables
10 +from PIL import Image, ImageEnhance
10 11
11 # ==== Config ==== 12 # ==== Config ====
12 BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) 13 BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
13 -PDF_NAME = 'aaaa' 14 +# PDF_NAME = 'aaaa'
14 15
15 # PDF path 16 # PDF path
16 -pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "fax.pdf" 17 +pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "Iwasaki_1.pdf"
17 # Output folder 18 # Output folder
18 output_folder = Path(BASE_DIR) / "public" / "image" 19 output_folder = Path(BASE_DIR) / "public" / "image"
19 20
20 -#PDF_NAME = pdf_path.stem # Get the stem of the PDF file 21 +PDF_NAME = pdf_path.stem # Get the stem of the PDF file
21 #print(PDF_NAME) 22 #print(PDF_NAME)
22 23
23 os.makedirs(output_folder, exist_ok=True) 24 os.makedirs(output_folder, exist_ok=True)
...@@ -39,8 +40,89 @@ pages[0].save(image_path, "JPEG") ...@@ -39,8 +40,89 @@ pages[0].save(image_path, "JPEG")
39 40
40 # ==== Run OCR ==== 41 # ==== Run OCR ====
41 image_np = np.array(pages[0]) 42 image_np = np.array(pages[0])
42 -results = ocr.predict(image_np)
43 43
def estimate_text_ratio(gray, block_size=256):
    """Return the median dark-pixel ratio measured over a grid of tiles.

    The single-channel image ``gray`` is cut into tiles of at most
    ``block_size`` x ``block_size`` pixels (tiles on the right/bottom edge
    may be smaller). Every tile is binarized with Otsu's method and the
    fraction of its pixels at or below the Otsu threshold is recorded.

    The median over all tiles is returned so that a handful of unusually
    dense or empty tiles cannot dominate the estimate. ``0.0`` is returned
    when no tile could be measured.
    """
    height, width = gray.shape
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    def dark_fraction(tile):
        # After inversion, pixels at or below the Otsu threshold become 255.
        _, tile_binary = cv2.threshold(tile, 0, 255, otsu_flags)
        ink_mask = 255 - tile_binary
        return np.sum(ink_mask > 0) / ink_mask.size

    tiles = (
        gray[top:top + block_size, left:left + block_size]
        for top in range(0, height, block_size)
        for left in range(0, width, block_size)
    )
    tile_ratios = [dark_fraction(tile) for tile in tiles if tile.size != 0]

    if not tile_ratios:
        return 0.0
    # Median rather than mean: robust against outlier tiles.
    return np.median(tile_ratios)
61 +
def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2,
                *,
                thin_threshold: float = 0.02) -> np.ndarray:
    """Thicken text strokes in an RGB image when the text looks thin.

    The image is converted to grayscale and ``estimate_text_ratio`` is used
    to decide whether the text is thin. If it is not, the input array is
    returned unchanged (the very same object). Otherwise the Otsu text mask
    is dilated, merged into the grayscale image, contrast-scaled, sharpened
    and returned as a new 3-channel image.

    Args:
        rgb_img: Image in RGB channel order (presumably uint8 -- confirm
            against the caller).
        kernel_size: Side of the square dilation kernel
            (2 = light, 3 = stronger).
        iterations: Number of dilation passes.
        contrast: Linear contrast gain (>= 1.0).
        sharpness: Unsharp-mask strength (>= 1.0; 1.0 leaves the image
            unsharpened).
        thin_threshold: Text is treated as thin when the median text ratio
            is strictly below this value.

    Returns:
        ``rgb_img`` itself when the text is not thin, otherwise a new
        3-channel image built from the processed grayscale image.
    """
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)

    # A single boolean drives both the log label and the decision so the two
    # can never disagree (previously they differed when ratio == threshold).
    text_ratio = estimate_text_ratio(gray, block_size=256)
    is_thin = text_ratio < thin_threshold
    print(f"text_ratio={text_ratio:.3f} -> {'Mảnh' if is_thin else 'Đậm'}")

    if not is_thin:
        return rgb_img

    # Text mask (text = 255); only computed when it is actually needed.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text_mask = 255 - binary

    # Grow the strokes.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)

    # Overlay on the original grayscale: work in inverted space so that
    # "darker" means "larger", take the per-pixel maximum, then invert back.
    inv_gray = 255 - gray
    inv_gray_boost = np.maximum(inv_gray, bold_mask)
    out_gray = 255 - inv_gray_boost

    # Linear contrast scaling.
    # NOTE(review): with beta=0 this scales around black, so mid-gray pixels
    # become brighter rather than being pushed away from mid-gray -- confirm
    # this is the intended effect.
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)

    # Unsharp mask. The blur weight is derived from `sharpness` so the two
    # weights always sum to 1 and overall brightness is preserved for any
    # sharpness value (it equals the former fixed -0.2 at the default 1.2).
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, 1.0 - sharpness, 0)

    # Back to 3 channels for the OCR engine.
    return cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)
108 +
# Thicken thin text strokes on the page image before running OCR.
preprocessed = bolden_text(
    image_np,
    kernel_size=3,  # dilation kernel size (2 = light, 3 = stronger)
    iterations=1,  # raise to 2 for bolder strokes
    contrast=1.5,  # 1.0 = unchanged, 1.5-2.0 = more pronounced
    sharpness=1.2  # > 1.0 for a sharper result
)

# Save the preprocessed image for visual inspection; it is converted from
# RGB to BGR channel order before being written with cv2.imwrite.
debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
cv2.imwrite(debug_path, cv2.cvtColor(preprocessed, cv2.COLOR_RGB2BGR))
print(f"[DEBUG] Preprocessed image saved to: {debug_path}")

# Run OCR (make sure the image has 3 channels)
if preprocessed.ndim == 2:
    preprocessed = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)

results = ocr.predict(preprocessed)
44 # ==== Convert polygon to bbox ==== 126 # ==== Convert polygon to bbox ====
45 def poly_to_bbox(poly): 127 def poly_to_bbox(poly):
46 xs = [p[0] for p in poly] 128 xs = [p[0] for p in poly]
......