tien_nemo

nếu chữ mảnh tăng đậm hơn

......@@ -7,17 +7,18 @@ import json
from pathlib import Path
import cv2
from table_detector import detect_tables
from PIL import Image, ImageEnhance
# ==== Config ====
# Project root: three directory levels above the directory containing this file.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Placeholder value; PDF_NAME is reassigned from pdf_path.stem below.
PDF_NAME = 'aaaa'
# PDF_NAME = 'aaaa'
# PDF path
# NOTE(review): pdf_path is assigned twice in a row, so only the second value
# ("Iwasaki_1.pdf") takes effect; the first looks like a leftover - confirm.
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "fax.pdf"
pdf_path = Path(BASE_DIR) / "storage" / "pdf" / "Iwasaki_1.pdf"
# Output folder
output_folder = Path(BASE_DIR) / "public" / "image"
#PDF_NAME = pdf_path.stem # Get the stem of the PDF file
PDF_NAME = pdf_path.stem # Get the stem of the PDF file
#print(PDF_NAME)
# Create the output folder (and any missing parents); no error if it already exists.
os.makedirs(output_folder, exist_ok=True)
......@@ -39,8 +40,89 @@ pages[0].save(image_path, "JPEG")
# ==== Run OCR ====
# Convert the first entry of `pages` into a NumPy array to pass to the OCR engine.
image_np = np.array(pages[0])
# NOTE(review): ocr.predict is called again later in this file on the
# preprocessed image, which overwrites `results` - confirm this first pass
# on the raw image is still needed.
results = ocr.predict(image_np)
def estimate_text_ratio(gray, block_size=256):
    """Return the median dark-pixel fraction over tiles of a grayscale image.

    The image is walked in ``block_size`` x ``block_size`` tiles (tiles on
    the right/bottom edge may be smaller). Each tile is binarised on its own
    with Otsu's threshold, and the fraction of its pixels that land on the
    dark side of that threshold is recorded. The median of those fractions
    is returned so that a few unusual tiles do not dominate the estimate.

    Args:
        gray: 2-D single-channel image array.
        block_size: Tile edge length in pixels.

    Returns:
        The median per-tile fraction, or ``0.0`` when the image yields no
        tiles (for example, an empty array).
    """
    height, width = gray.shape
    tile_ratios = []
    for top in range(0, height, block_size):
        for left in range(0, width, block_size):
            tile = gray[top:top + block_size, left:left + block_size]
            if tile.size == 0:
                continue
            otsu = cv2.threshold(
                tile, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]
            # Invert so that pixels at or below the Otsu threshold are non-zero.
            ink = 255 - otsu
            tile_ratios.append(np.count_nonzero(ink) / ink.size)
    if not tile_ratios:
        return 0.0
    # Median rather than mean, to stay robust against outlier tiles.
    return np.median(tile_ratios)
def bolden_text(rgb_img: np.ndarray,
                kernel_size: int = 3,
                iterations: int = 1,
                contrast: float = 1.5,
                sharpness: float = 1.2,
                thin_threshold: float = 0.02) -> np.ndarray:
    """Thicken thin text strokes in an RGB image before OCR.

    The image is converted to grayscale and its text density is estimated
    with ``estimate_text_ratio``. If the density is above ``thin_threshold``
    the text is treated as already bold and the input is returned untouched.
    Otherwise the dark (text) pixels are dilated, overlaid on the original
    grayscale, contrast-scaled and sharpened.

    Args:
        rgb_img: 3-channel RGB image array.
        kernel_size: Edge length of the square dilation kernel
            (2 = light, 3 = stronger).
        iterations: Number of dilation passes.
        contrast: Linear intensity scale factor (>= 1.0; 1.0 = unchanged).
        sharpness: Unsharp-mask strength (>= 1.0; 1.0 = unchanged).
        thin_threshold: Text ratio at or below which the text is considered
            thin and gets thickened.

    Returns:
        ``rgb_img`` itself when the text is judged bold enough; otherwise a
        new 3-channel array whose channels all hold the processed grayscale
        image.
    """
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)

    text_ratio = estimate_text_ratio(gray, block_size=256)
    # A single flag drives both the log label and the early return so the two
    # can never disagree when text_ratio equals the threshold.
    is_thin = text_ratio <= thin_threshold
    print(f"text_ratio={text_ratio:.3f} -> {'Mảnh' if is_thin else 'Đậm'}")
    if not is_thin:
        return rgb_img

    # Otsu binarisation is only needed on the thickening path.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Text mask (text = 255), grown by dilation.
    text_mask = 255 - binary
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    bold_mask = cv2.dilate(text_mask, kernel, iterations=iterations)

    # Overlay on the original grayscale: work in inverted space so that
    # "darker" means "larger", take the per-pixel maximum, then invert back.
    inv_gray = 255 - gray
    out_gray = 255 - np.maximum(inv_gray, bold_mask)

    # Linear scale with zero offset: black stays black while lighter pixels
    # are pushed toward (and saturate at) white.
    out_gray = cv2.convertScaleAbs(out_gray, alpha=contrast, beta=0)

    # Unsharp mask: sharpness * img + (1 - sharpness) * blur. The weights sum
    # to 1 so overall brightness is preserved for any sharpness value, and
    # sharpness == 1.0 leaves the image unchanged.
    blur = cv2.GaussianBlur(out_gray, (0, 0), 0.8)
    out_gray = cv2.addWeighted(out_gray, sharpness, blur, 1.0 - sharpness, 0)

    # Back to 3 channels for the OCR engine.
    return cv2.cvtColor(out_gray, cv2.COLOR_GRAY2RGB)
# Thicken thin strokes before OCR; tune the arguments below per document.
preprocessed = bolden_text(
    image_np,
    kernel_size=3,   # larger kernel = thicker strokes
    iterations=1,    # raise to 2 for heavier thickening
    contrast=1.5,    # 1.0 = unchanged, 1.5-2.0 = stronger
    sharpness=1.2,   # > 1.0 sharpens
)

# Make sure the image has 3 channels before any RGB-specific call below;
# the RGB->BGR conversion used for the debug dump rejects 2-D arrays.
if preprocessed.ndim == 2:
    preprocessed = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2RGB)

# Dump the preprocessed image for visual inspection. cv2.imwrite expects BGR
# channel order and returns False when the file could not be written.
debug_path = os.path.join(output_folder, f"{img_base_name}_preprocessed_debug.jpg")
if cv2.imwrite(debug_path, cv2.cvtColor(preprocessed, cv2.COLOR_RGB2BGR)):
    print(f"[DEBUG] Preprocessed image saved to: {debug_path}")
else:
    print(f"[DEBUG] Failed to save preprocessed image to: {debug_path}")

# Run OCR on the preprocessed image.
results = ocr.predict(preprocessed)
# ==== Convert polygon to bbox ====
def poly_to_bbox(poly):
xs = [p[0] for p in poly]
......