# utils.py
"""Helpers for rendering PDF pages, locating tables, running OCR and
grouping the recognised text into lines and tables."""
import os
import re

import cv2
import fitz
import numpy as np
from sklearn.cluster import DBSCAN

# Pre-compiled patterns used by the OCR clean-up helpers.
_RE_NON_ASCII_LETTER = re.compile(r'[^a-zA-Z]')
_RE_NUMERIC_WITH_O = re.compile(r'^[\d\sO]+$')
_RE_WHITESPACE = re.compile(r"\s+")
_RE_CODE_METIER = re.compile(r'[A-Z]\d+')
_RE_ASCII_ALNUM = re.compile(r"[A-Za-z0-9]")


def redresser_image_auto(img_array):
    """Rotate the image 90 degrees counter-clockwise if vertical strokes dominate.

    Long horizontal and long vertical runs are extracted from a binarised
    copy with morphological openings; their pixel counts are compared.

    Args:
        img_array: 3-channel image array (converted with COLOR_RGB2GRAY).

    Returns:
        The rotated image, or ``img_array`` itself (same object) when no
        rotation is applied.
    """
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
    )
    h, w = binary.shape
    # Kernel sides must be >= 1, otherwise OpenCV rejects the size for
    # images smaller than 10 px.
    kh = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 10), 1))
    kv = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, h // 10)))
    score_h = cv2.countNonZero(cv2.morphologyEx(binary, cv2.MORPH_OPEN, kh))
    score_v = cv2.countNonZero(cv2.morphologyEx(binary, cv2.MORPH_OPEN, kv))
    if (score_v / 1.5) > (score_h * 1.3):
        return cv2.rotate(img_array, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return img_array


def obtenir_zone_tableau_total(img_array, marge=100, hauteur_min=10):
    """Return the vertical pixel range ``(y_min, y_max)`` covering the content.

    The image is blurred, binarised and dilated so that neighbouring marks
    merge into blobs; the range spans every blob taller than
    ``hauteur_min``, widened by ``marge`` pixels and clamped to the image.

    Args:
        img_array: 3-channel image array.
        marge: padding in pixels added above and below the detected range.
        hauteur_min: blobs whose bounding box is not taller than this are
            ignored.

    Returns:
        ``(y_min, y_max)``; ``(0, image_height)`` when nothing is detected.
    """
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    binary = cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
    )
    h = binary.shape[0]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 5))
    dilated = cv2.dilate(binary, kernel, iterations=3)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return 0, h

    y_points = []
    for c in contours:
        _x, y, _w_c, h_c = cv2.boundingRect(c)
        if h_c > hauteur_min:
            y_points.append(y)
            y_points.append(y + h_c)
    if not y_points:
        return 0, h
    return max(0, min(y_points) - marge), min(h, max(y_points) + marge)


def _pixmap_to_rgb(pix):
    """View the samples of an RGB (no alpha) pixmap as an (h, w, 3) uint8 array."""
    return np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)


def preparer_image_zoom_hd(pdf_path, page_index, zoom=4):
    """Render a page in two passes and return a cropped high-resolution image.

    Pass 1 renders the page at scale 1 to locate the content band; pass 2
    renders only that band at ``zoom`` scale.

    Args:
        pdf_path: path of the PDF to open.
        page_index: zero-based page number.
        zoom: scale factor of the high-resolution render.

    Returns:
        The cropped high-resolution RGB image as a numpy array.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_index)

        # 1. Low-resolution localisation.
        pix_low = page.get_pixmap(matrix=fitz.Matrix(1, 1), colorspace=fitz.csRGB)
        img_low = _pixmap_to_rgb(pix_low)
        img_low_upright = redresser_image_auto(img_low)
        # redresser_image_auto returns its argument unchanged when it does
        # not rotate, so identity tells us whether a rotation happened.
        rotated = img_low_upright is not img_low
        y_min, y_max = obtenir_zone_tableau_total(img_low_upright)

        # 2. Crop computation, as fractions of the analysed image height.
        h_low = img_low_upright.shape[0]
        y_start_pct = y_min / h_low
        y_end_pct = y_max / h_low
        full_rect = page.rect
        if rotated:
            # After a 90 degree counter-clockwise rotation, row r of the
            # rotated image is column (W - 1 - r) of the page, so the row
            # band maps to a mirrored vertical band of the page.
            crop_rect = fitz.Rect(
                full_rect.x0 + full_rect.width * (1 - y_end_pct),
                full_rect.y0,
                full_rect.x0 + full_rect.width * (1 - y_start_pct),
                full_rect.y1,
            )
        else:
            crop_rect = fitz.Rect(
                full_rect.x0,
                full_rect.y0 + full_rect.height * y_start_pct,
                full_rect.x1,
                full_rect.y0 + full_rect.height * y_end_pct,
            )

        # 3. High-resolution render of the cropped band.
        pix_high = page.get_pixmap(
            matrix=fitz.Matrix(zoom, zoom), clip=crop_rect, colorspace=fitz.csRGB
        )
        img_finale = _pixmap_to_rgb(pix_high)
        if rotated:
            # Apply the rotation already decided on the full page instead of
            # re-detecting it on a narrow strip.
            return cv2.rotate(img_finale, cv2.ROTATE_90_COUNTERCLOCKWISE)
        return redresser_image_auto(img_finale)
    finally:
        # Always release the document, even when rendering fails.
        doc.close()


def extraire_donnees_ocr(img, ocr_model, conf_min=0.6):
    """Run OCR and return the kept detections with their centre coordinates.

    Detections containing more than two ASCII letters are skipped, as are
    those whose confidence is below ``conf_min``.

    Args:
        img: image array; only ``img.shape[:2]`` is read here.
        ocr_model: object exposing ``ocr(img, cls=True)``; the first element
            of its result is iterated as ``(box, (text, conf))`` entries
            (presumably a PaddleOCR model - confirm against the caller).
        conf_min: minimum confidence for a detection to be kept.

    Returns:
        A list of dicts with keys ``text``, ``x_pct``, ``y_pct`` (centre as
        a percentage of image width/height, one decimal), ``y_c`` and
        ``x_c`` (centre in pixels), sorted by ``(y_c, x_c)``.
    """
    h_f, w_f = img.shape[:2]
    result = ocr_model.ocr(img, cls=True)
    extracted = []
    if result and result[0]:
        for line in result[0]:
            box, (text, conf) = line[0], line[1]
            if len(_RE_NON_ASCII_LETTER.sub('', text)) > 2:
                continue
            if conf < conf_min:
                continue
            # Average over the actual number of corners rather than a
            # hard-coded 4.
            n_pts = len(box)
            x_c = sum(p[0] for p in box) / n_pts
            y_c = sum(p[1] for p in box) / n_pts
            extracted.append({
                "text": text,
                "x_pct": round(x_c / w_f * 100, 1),
                "y_pct": round(y_c / h_f * 100, 1),
                "y_c": y_c,
                "x_c": x_c,
            })
    extracted.sort(key=lambda r: (r["y_c"], r["x_c"]))
    return extracted


def nettoyer_texte_ocr(text):
    """Normalise one OCR string.

    Removes ``[``, ``]`` and ``/``, turns ``O`` into ``0`` when the text
    consists only of digits, whitespace and ``O``, rewrites ``RO`` as
    ``R0``, collapses whitespace runs and strips the result.

    Returns:
        The cleaned string; ``""`` for any falsy input.
    """
    if not text:
        return ""
    text = str(text)
    # Remove frequent OCR artefacts.
    text = text.replace("]", "").replace("[", "").replace("/", "")
    # Fix O -> 0 only when the text is otherwise numeric.
    if _RE_NUMERIC_WITH_O.match(text):
        text = text.replace("O", "0")
    # Fix the classic R/RO confusion (applied to every occurrence).
    text = text.replace("RO", "R0")
    # Collapse whitespace.
    text = _RE_WHITESPACE.sub(" ", text)
    return text.strip()


def est_code_metier(text):
    """Return True when ``text`` is exactly one uppercase letter followed by digits."""
    # fullmatch: unlike '^...$' with match, a trailing newline is rejected.
    return bool(_RE_CODE_METIER.fullmatch(text))


def nettoyage_sortie_ocr(data):
    """Clean OCR entries and drop empty or symbol-only ones.

    Args:
        data: iterable of dicts (keys ``text``, ``x_pct``, ``y_pct`` are
            read) or of arbitrary objects, which are converted with ``str``
            and get ``None`` coordinates.

    Returns:
        A list of ``{"x_pct", "y_pct", "text"}`` dicts for every entry whose
        cleaned text is non-empty and contains at least one ASCII letter or
        digit.
    """
    cleaned = []
    for r in data:
        # Safety: accept a dict OR anything convertible to a string.
        if isinstance(r, dict):
            text = nettoyer_texte_ocr(r.get("text", ""))
            x = r.get("x_pct")
            y = r.get("y_pct")
        else:
            text = nettoyer_texte_ocr(str(r))
            x = None
            y = None

        # Ignore empty text.
        if not text:
            continue
        # Keep business codes as they are; otherwise require at least one
        # ASCII alphanumeric character (this also rejects pure symbol noise).
        if est_code_metier(text) or _RE_ASCII_ALNUM.search(text):
            cleaned.append({"x_pct": x, "y_pct": y, "text": text})
    return cleaned


def formater_donnees_section(data_page, page_index):
    """Format the entries of one page as readable, easy-to-parse lines.

    Args:
        data_page: iterable of dicts with ``x_pct``, ``y_pct`` (convertible
            to float) and ``text``.
        page_index: zero-based page number; the header shows it plus one.

    Returns:
        A list of strings: a page header followed by one line per entry.
    """
    lignes = [f"\n--- DONNÉES PAGE {page_index + 1} ---"]
    for r in data_page:
        x = float(r["x_pct"])
        y = float(r["y_pct"])
        text = r["text"]
        lignes.append(f"x={x:.1f}% | y={y:.1f}% | {text}")
    return lignes


def to_points(data):
    """Convert entries to ``(x, y, text)`` tuples, the format used for DBSCAN."""
    return [(r["x_pct"], r["y_pct"], r["text"]) for r in data]


def sauvegarder_fichier_unique(contenu_total, pdf_path, section_name):
    """Write all accumulated lines to a single text file next to the PDF.

    The output name is ``<pdf stem>_<section_name>_complet.txt``, where
    spaces in the PDF file name are replaced by underscores.

    Args:
        contenu_total: iterable of strings, joined with newlines.
        pdf_path: path of the source PDF.
        section_name: label inserted in the output file name.

    Returns:
        The path of the file that was written.
    """
    directory, name = os.path.split(str(pdf_path))
    stem, ext = os.path.splitext(name)
    # Strip only a trailing .pdf extension (any case), never a ".pdf"
    # occurring elsewhere in the path.
    if ext.lower() != ".pdf":
        stem = name
    # Sanitise the file name only: rewriting spaces in directory names would
    # point at a directory that does not exist.
    nom_propre = stem.replace(' ', '_')
    filename = os.path.join(directory, f"{nom_propre}_{section_name}_complet.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(contenu_total))
    return filename


########## DBSCAN

def prepare_for_dbscan(points):
    """Build the DBSCAN feature matrix from ``[[x, y, text], ...]``.

    Only the Y coordinate is used.

    Returns:
        An array of shape ``(len(points), 1)``, including ``(0, 1)`` for
        empty input.
    """
    # reshape keeps the array 2-D even when there are no points.
    return np.array([[p[1]] for p in points]).reshape(-1, 1)


def _group_by_label(labels, points):
    """Group points by cluster label, dropping noise (label -1)."""
    groups = {}
    for label, point in zip(labels, points):
        if label == -1:
            continue  # noise
        groups.setdefault(label, []).append(point)
    return list(groups.values())


def cluster_lines(points, eps=0.4, min_samples=2):
    """Group points into text lines by clustering their Y coordinate.

    Points that DBSCAN labels as noise are discarded, so with the default
    ``min_samples=2`` a point with no neighbour within ``eps`` is dropped.

    Args:
        points: sequence of ``(x, y, text)``.
        eps: DBSCAN neighbourhood radius, in the unit of ``y``.
        min_samples: DBSCAN core-point threshold.

    Returns:
        A list of clusters, each a list of the original points.
    """
    if not points:
        # DBSCAN refuses an empty sample set.
        return []
    coords = prepare_for_dbscan(points)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)
    return _group_by_label(labels, points)


def build_lines(clusters):
    """Turn clusters of points into line records sorted top to bottom.

    Returns:
        A list of dicts with ``y`` (mean Y of the cluster), ``text`` (the
        texts joined left to right with spaces) and ``points`` (the points
        sorted by X).
    """
    lignes = []
    for cluster in clusters:
        if not cluster:
            continue  # nothing to average or join
        # Sort left -> right.
        cluster_sorted = sorted(cluster, key=lambda p: p[0])
        lignes.append({
            "y": np.mean([p[1] for p in cluster]),
            "text": " ".join(p[2] for p in cluster_sorted),
            "points": cluster_sorted,
        })
    # Sort top -> bottom.
    lignes.sort(key=lambda ligne: ligne["y"])
    return lignes


def merge_close_lines(lignes, threshold=0.6):
    """Merge consecutive lines whose Y differs by less than ``threshold``.

    Each line is compared with the first line of the current group, whose
    ``y`` is kept for the merged record. Texts are joined with a space and,
    when present, ``points`` lists are concatenated in the same order. The
    input dicts are not modified.

    Args:
        lignes: line records (``y``, ``text``, optionally ``points``),
            expected to be sorted by ``y``.
        threshold: maximum Y distance for two lines to be merged.

    Returns:
        A new list of line records.
    """
    merged = []
    current = None
    for line in lignes:
        if current is None:
            current = dict(line)
            continue
        if abs(line["y"] - current["y"]) < threshold:
            current["text"] = current["text"] + " " + line["text"]
            if "points" in current or "points" in line:
                # Keep the points of the absorbed line with the merged text.
                current["points"] = (
                    list(current.get("points", [])) + list(line.get("points", []))
                )
        else:
            merged.append(current)
            current = dict(line)
    if current is not None:
        merged.append(current)
    return merged


def split_tables(points, eps_y=2.0, min_samples=5):
    """Split points into tables according to their vertical (Y) distance.

    Points labelled as noise by DBSCAN are discarded.

    Args:
        points: sequence of ``(x, y, text)``.
        eps_y: DBSCAN neighbourhood radius on the Y axis.
        min_samples: DBSCAN core-point threshold.

    Returns:
        A list of tables, each a list of the original points.
    """
    if not points:
        # DBSCAN refuses an empty sample set.
        return []
    y_coords = np.array([[p[1]] for p in points])  # Y only
    clustering = DBSCAN(eps=eps_y, min_samples=min_samples).fit(y_coords)
    return _group_by_label(clustering.labels_, points)