# Abdenbi / SFCR_Extraction
# utils.py
import fitz
import cv2
import numpy as np
import re

def redresser_image_auto(img_array):
    """Rotate the image by 90° counter-clockwise when vertical strokes dominate.

    The image is binarised (inverted adaptive threshold) and opened twice:
    once with a wide, 1-pixel-high kernel and once with a tall, 1-pixel-wide
    kernel. The count of surviving pixels gives a horizontal and a vertical
    score. The image is rotated only when ``score_v / 1.5 > score_h * 1.3``.

    Args:
        img_array: Image array converted with ``cv2.COLOR_RGB2GRAY``, so a
            3-channel RGB layout is expected.

    Returns:
        The rotated image, or ``img_array`` itself when no rotation applies.
    """
    grayscale = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    mask = cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        15,
        4,
    )
    height, width = mask.shape

    def _opened_pixel_count(kernel_size):
        # Pixels that survive a morphological opening with a rectangular kernel.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        return cv2.countNonZero(cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel))

    # Kernel lengths are one tenth of the corresponding image dimension.
    horizontal_score = _opened_pixel_count((width // 10, 1))
    vertical_score = _opened_pixel_count((1, height // 10))

    if (vertical_score / 1.5) > (horizontal_score * 1.3):
        return cv2.rotate(img_array, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return img_array

def obtenir_zone_tableau_total(img_array):
    """Return the vertical extent ``(y_min, y_max)`` covering the detected content.

    The image is blurred, binarised and dilated so nearby marks merge into
    blobs. The bounding boxes of external contours taller than 10 pixels
    define the extent, which is padded by 100 pixels on each side and clamped
    to ``[0, image height]``.

    Args:
        img_array: Image array converted with ``cv2.COLOR_RGB2GRAY``.

    Returns:
        Tuple ``(y_min, y_max)`` in pixels; ``(0, image height)`` when no
        sufficiently tall contour is found.
    """
    grayscale = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
    mask = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        15,
        4,
    )
    img_height = mask.shape[0]

    # A 20x5 kernel applied three times merges neighbouring marks into blobs.
    merge_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 5))
    blobs = cv2.dilate(mask, merge_kernel, iterations=3)
    contours, _ = cv2.findContours(blobs, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    vertical_edges = []
    for contour in contours:
        _, top, _, box_height = cv2.boundingRect(contour)
        if box_height > 10:  # ignore thin specks
            vertical_edges.extend((top, top + box_height))

    # Covers both "no contour at all" and "only tiny contours".
    if not vertical_edges:
        return 0, img_height

    upper = max(0, min(vertical_edges) - 100)
    lower = min(img_height, max(vertical_edges) + 100)
    return upper, lower

def preparer_image_zoom_hd(pdf_path, page_index):
    """Render a cropped, high-resolution image of one PDF page in two passes.

    Pass 1 renders the page at scale 1 and finds the vertical content extent
    with ``obtenir_zone_tableau_total``. Pass 2 renders only that vertical
    band at scale 4 and applies ``redresser_image_auto`` to the result.

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
        page_index: Page index passed to ``doc.load_page``.

    Returns:
        ``numpy`` array of shape ``(height, width, 3)`` holding the
        high-resolution crop.
    """
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document is closed even when rendering or
    # one of the image helpers raises.
    try:
        page = doc.load_page(page_index)

        # 1. Low-resolution localisation.
        # The reshape assumes 3 bytes per pixel (RGB, no alpha), so the
        # colorspace is made explicit like in the high-resolution pass.
        pix_low = page.get_pixmap(matrix=fitz.Matrix(1, 1), colorspace=fitz.csRGB)
        img_low = np.frombuffer(pix_low.samples, dtype=np.uint8).reshape(
            pix_low.h, pix_low.w, 3
        )
        img_low = redresser_image_auto(img_low)
        y_min, y_max = obtenir_zone_tableau_total(img_low)

        # 2. Crop computation, as fractions of the low-resolution height.
        # NOTE(review): if redresser_image_auto rotated img_low, these
        # fractions are measured on the rotated image but applied to the
        # unrotated page height below — confirm this is intended.
        h_low = img_low.shape[0]
        y_start_pct = y_min / h_low
        y_end_pct = y_max / h_low

        full_rect = page.rect
        crop_rect = fitz.Rect(
            full_rect.x0,
            full_rect.y0 + (full_rect.height * y_start_pct),
            full_rect.x1,
            full_rect.y0 + (full_rect.height * y_end_pct),
        )

        # 3. High-resolution rendering (x4) restricted to the crop.
        pix_high = page.get_pixmap(
            matrix=fitz.Matrix(4, 4), clip=crop_rect, colorspace=fitz.csRGB
        )
        img_finale = np.frombuffer(pix_high.samples, dtype=np.uint8).reshape(
            pix_high.h, pix_high.w, 3
        )
        return redresser_image_auto(img_finale)
    finally:
        doc.close()

def extraire_donnees_ocr(img, ocr_model):
    """Run OCR on ``img`` and return the kept detections sorted by position.

    A detection is dropped when its text has more than two ASCII letters, or
    when its confidence is below 0.6.

    Args:
        img: Image array; only ``img.shape[:2]`` (height, width) is read here.
        ocr_model: Object exposing ``ocr(img, cls=True)``. The first element
            of its result is iterated as ``[box, (text, confidence)]`` items,
            where ``box`` is treated as four ``(x, y)`` corner points.

    Returns:
        List of dicts with keys ``text``, ``x_pct``, ``y_pct`` (box centre as
        a percentage of the image size, one decimal), ``y_c`` and ``x_c``
        (box centre in pixels), sorted by ``(y_c, x_c)``.
    """
    img_height, img_width = img.shape[:2]
    ocr_output = ocr_model.ocr(img, cls=True)

    rows = []
    if ocr_output and ocr_output[0]:
        for detection in ocr_output[0]:
            box, (text, conf) = detection[0], detection[1]

            # More than two letters means a label, not a numeric value or code.
            letters_only = re.sub(r'[^a-zA-Z]', '', text)
            if len(letters_only) > 2:
                continue

            if not conf >= 0.6:
                continue

            # Box centre: mean of the four corner coordinates.
            centre_x = sum(corner[0] for corner in box) / 4
            centre_y = sum(corner[1] for corner in box) / 4
            rows.append({
                "text": text,
                "x_pct": round(centre_x / img_width * 100, 1),
                "y_pct": round(centre_y / img_height * 100, 1),
                "y_c": centre_y,
                "x_c": centre_x,
            })

    # Reading order: top to bottom, then left to right.
    return sorted(rows, key=lambda row: (row["y_c"], row["x_c"]))


def nettoyer_texte_ocr(text):
    """Normalise one OCR text fragment.

    Steps, in order: remove ``]``, ``[`` and ``/``; replace the letter ``O``
    by ``0`` when the text is only digits, whitespace and ``O``; replace every
    ``RO`` by ``R0``; collapse whitespace runs to one space and strip.

    Args:
        text: Value to clean; converted with ``str``. Falsy values give ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    cleaned = str(text)

    # Frequent OCR artefacts.
    for artefact in ("]", "[", "/"):
        cleaned = cleaned.replace(artefact, "")

    # O -> 0 only when the fragment is otherwise purely numeric.
    looks_numeric = re.match(r'^[\d\sO]+$', cleaned)
    if looks_numeric:
        cleaned = cleaned.replace("O", "0")

    # Classic R/RO confusion; applied to every occurrence of "RO".
    cleaned = cleaned.replace("RO", "R0")

    return re.sub(r"\s+", " ", cleaned).strip()


def est_code_metier(text):
    """Return True when ``text`` is one uppercase ASCII letter followed by digits."""
    return re.match(r'^[A-Z]\d+$', text) is not None


def nettoyage_sortie_ocr(data):
    """Clean OCR entries and drop the ones that carry no usable text.

    Each entry is either a dict (its ``text``, ``x_pct`` and ``y_pct`` are
    read with ``.get``) or any other value, which is converted with ``str``
    and gets ``None`` coordinates. The text goes through
    ``nettoyer_texte_ocr``. Empty text, text made only of non-word characters
    or underscores, and text without any ASCII letter or digit are discarded.

    Args:
        data: Iterable of OCR entries (dicts or values convertible to str).

    Returns:
        List of dicts with keys ``x_pct``, ``y_pct`` and ``text``.
    """
    kept = []

    for entry in data:
        # Safety: accept a dict OR a bare value.
        if isinstance(entry, dict):
            raw_text = entry.get("text", "")
            x_pos = entry.get("x_pct")
            y_pos = entry.get("y_pct")
        else:
            raw_text = str(entry)
            x_pos = None
            y_pos = None

        text = nettoyer_texte_ocr(raw_text)

        # Skip empty fragments and pure punctuation noise.
        if not text or re.match(r"^[\W_]+$", text):
            continue

        # Business codes are kept as they are; anything else must contain at
        # least one ASCII letter or digit.
        if est_code_metier(text) or re.search(r"[A-Za-z0-9]", text):
            kept.append({
                "x_pct": x_pos,
                "y_pct": y_pos,
                "text": text
            })

    return kept

def formater_donnees_section(data_page, page_index):
    """Format one page of entries as readable, parse-friendly text lines.

    Args:
        data_page: Iterable of dicts with keys ``x_pct``, ``y_pct`` (both
            converted with ``float``) and ``text``.
        page_index: Zero-based page index; the header shows ``page_index + 1``.

    Returns:
        List of strings: a page header followed by one
        ``x=..% | y=..% | text`` line per entry.
    """
    header = f"\n--- DONNÉES PAGE {page_index + 1} ---"
    body = [
        f"x={float(item['x_pct']):.1f}% | y={float(item['y_pct']):.1f}% | {item['text']}"
        for item in data_page
    ]
    return [header, *body]

def to_points(data):
    """Convert entry dicts into ``(x_pct, y_pct, text)`` tuples.

    This is the point format consumed by the DBSCAN helpers of this module.
    """
    points = []
    for entry in data:
        points.append((entry["x_pct"], entry["y_pct"], entry["text"]))
    return points


def sauvegarder_fichier_unique(contenu_total, pdf_path, section_name):
    """Save all accumulated page lines into a single text file next to the PDF.

    The output name is ``<pdf name without .pdf>_<section_name>_complet.txt``
    with spaces in the file name replaced by underscores. Only the file name
    is rewritten: the directory part of ``pdf_path`` is kept untouched, so a
    folder containing spaces or ``.pdf`` in its name still resolves.

    Args:
        contenu_total: Iterable of strings, written joined by newlines.
        pdf_path: Path of the source PDF (``str`` or path-like).
        section_name: Label inserted in the output file name.

    Returns:
        The path of the written file.
    """
    import os

    pdf_path = os.fspath(pdf_path)
    nom_fichier = os.path.basename(pdf_path)
    # Slice instead of os.path.dirname so the original separator is preserved.
    dossier = pdf_path[:len(pdf_path) - len(nom_fichier)]

    # Strip only a trailing ".pdf" extension (any case), not every occurrence.
    racine, extension = os.path.splitext(nom_fichier)
    if extension.lower() != ".pdf":
        racine = nom_fichier

    nom_propre = racine.replace(' ', '_')
    filename = f"{dossier}{nom_propre}_{section_name}_complet.txt"

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(contenu_total))

    return filename


########## DBSCAN 
import numpy as np

def prepare_for_dbscan(points):
    """Build the coordinate array used for clustering, from the Y values only.

    Args:
        points: Sequence of ``[x, y, text]`` items.

    Returns:
        ``numpy`` array with one single-element row ``[y]`` per point.
    """
    y_rows = []
    for point in points:
        y_rows.append([point[1]])  # only Y takes part in the clustering
    return np.array(y_rows)

from sklearn.cluster import DBSCAN

def cluster_lines(points, eps=0.4, min_samples=2):
    """Group points into text lines by clustering their Y coordinate.

    Args:
        points: Sequence of ``(x, y, text)`` items.
        eps: DBSCAN neighbourhood radius, in the same unit as ``y``.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        List of clusters (each a list of points), in order of first
        appearance of their label. Points labelled as noise (``-1``) are
        dropped. An empty input gives an empty list.
    """
    # DBSCAN rejects an empty array; a page without points has no lines.
    if len(points) == 0:
        return []

    coords = prepare_for_dbscan(points)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(coords)

    clusters = {}
    for label, point in zip(labels, points):
        if label == -1:
            continue  # noise
        clusters.setdefault(label, []).append(point)

    return list(clusters.values())

def build_lines(clusters):
    """Turn point clusters into line records ordered from top to bottom.

    Args:
        clusters: Iterable of clusters, each a sequence of ``(x, y, text)``.

    Returns:
        List of dicts with keys ``y`` (mean Y of the cluster), ``text`` (the
        texts joined by spaces, left to right) and ``points`` (the cluster
        sorted by X), sorted by ``y``.
    """
    rows = []

    for group in clusters:
        # Left-to-right reading order inside the line.
        ordered = sorted(group, key=lambda point: point[0])
        rows.append({
            "y": np.mean([point[1] for point in group]),
            "text": " ".join(point[2] for point in ordered),
            "points": ordered,
        })

    # Top-to-bottom order across lines.
    rows.sort(key=lambda row: row["y"])
    return rows


def merge_close_lines(lignes, threshold=0.6):
    """Merge consecutive lines whose Y positions are closer than ``threshold``.

    Each line is compared with the first line of the current merged group.
    When merged, its text is appended after a space and its ``points`` (if
    any) are appended too, so text and points stay consistent. The input
    dicts are not modified: the result holds shallow copies.

    Args:
        lignes: Sequence of dicts with at least ``y`` and ``text``, expected
            in top-to-bottom order.
        threshold: Maximum Y distance (exclusive) for two lines to merge.

    Returns:
        New list of merged line dicts.
    """
    merged = []

    for line in lignes:
        if merged and abs(line["y"] - merged[-1]["y"]) < threshold:
            current = merged[-1]
            current["text"] += " " + line["text"]
            if "points" in line:
                current.setdefault("points", []).extend(line["points"])
            continue

        # Copy so the caller's dict and its points list are left untouched.
        copie = dict(line)
        if "points" in copie:
            copie["points"] = list(copie["points"])
        merged.append(copie)

    return merged


def split_tables(points, eps_y=2.0, min_samples=5):
    """Split points into tables according to their vertical (Y) distance.

    Args:
        points: Sequence of ``(x, y, text)`` items.
        eps_y: DBSCAN neighbourhood radius along Y, in the same unit as ``y``.
        min_samples: DBSCAN ``min_samples`` parameter; defaults to the value
            that was previously hard-coded.

    Returns:
        List of point groups, in order of first appearance of their label.
        Points labelled as noise (``-1``) are dropped. An empty input gives
        an empty list.
    """
    # DBSCAN rejects an empty array; no points means no tables.
    if len(points) == 0:
        return []

    # Only the Y coordinate takes part in the clustering.
    y_coords = prepare_for_dbscan(points)

    clustering = DBSCAN(eps=eps_y, min_samples=min_samples).fit(y_coords)

    tables = {}
    for label, point in zip(clustering.labels_, points):
        if label == -1:
            continue  # noise
        tables.setdefault(label, []).append(point)

    return list(tables.values())