# Source repository: Abdenbi / SFCR_Extraction
							
import json


def get_closest_col(x, current_cols, max_distance=8.0):
    """Return the column nearest to ``x``, or ``None`` if none is close enough.

    Args:
        x: Horizontal position of the value to place.
        current_cols: Sequence of column tuples whose first item is the
            column's x position (the caller in this file passes
            ``(x_position, name)`` pairs).
        max_distance: Largest accepted ``abs(x - column_x)``; the original
            author's note describes it as being in PDF units.

    Returns:
        The matching column tuple unchanged, or ``None`` when
        ``current_cols`` is empty or the nearest column is farther away
        than ``max_distance``. On equal distances the first column wins.
    """
    # ``default=None`` keeps an empty column list from raising ValueError.
    nearest = min(current_cols, key=lambda col: abs(x - col[0]), default=None)
    if nearest is None:
        return None
    if abs(x - nearest[0]) > max_distance:
        return None  # No column close enough.
    return nearest

def transform_to_clean_markdown(data):
    """Render extracted table points as Markdown tables.

    Args:
        data: Iterable of dict entries, each with a ``"points"`` list. A
            point is read as ``point[0]`` (x position) and ``point[2]``
            (text); ``point[1]`` is not used.

    Returns:
        A single string of Markdown lines joined with newlines. Each header
        entry starts a new table (separated from the previous one by a
        ``---`` rule); each following entry that carries a row label becomes
        one table row. Cells with no matching value are rendered as ``"0"``.

    Processing rules, per entry:
        * An entry is a header if any of its texts starts with ``"C"``.
          Every text on it that does not start with ``"R"`` becomes a
          column, positioned at that point's x.
        * Otherwise, the first text starting with ``"R"`` is the row label;
          every other point is assigned to the nearest column via
          ``get_closest_col`` (max distance 8.0) and ignored if none is
          close enough.
        * Entries without points, without a row label, or seen before any
          header are skipped.
    """
    current_cols = []
    output_lines = []

    for entry in data:
        points = entry.get("points", [])
        if not points:
            continue

        # Normalise once to (x, text-as-str). Indexing (rather than 3-way
        # unpacking) tolerates points with extra trailing fields, and the
        # str() coercion keeps non-string labels from breaking str.join.
        cells = [(point[0], str(point[2])) for point in points]

        # 1. Header entry: contains at least one "C..." label.
        if any(text.startswith("C") for _, text in cells):
            if output_lines:
                output_lines.append("\n---\n")

            # Every point is a column (including codes such as 06000),
            # except those that look like a row label ("R...").
            current_cols = [
                (x_pos, text) for x_pos, text in cells
                if not text.startswith("R")
            ]
            col_names = [name for _, name in current_cols]

            output_lines.append(f"| Code | {' | '.join(col_names)} |")
            output_lines.append(
                f"| :--- | {' | '.join([':---'] * len(col_names))} |"
            )
            continue

        # 2. Data entry: needs a row label and a previously seen header.
        row_label = next(
            (text for _, text in cells if text.startswith("R")), None
        )
        if row_label is None or not current_cols:
            continue

        row_values = {name: "0" for _, name in current_cols}
        for x_pos, text in cells:
            if text == row_label:
                continue

            best_col = get_closest_col(x_pos, current_cols, max_distance=8.0)
            # No nearby column means the value lies outside the table.
            if best_col is not None:
                row_values[best_col[1]] = text.strip()

        ordered_values = [row_values[name] for _, name in current_cols]
        output_lines.append(
            f"| **{row_label}** | {' | '.join(ordered_values)} |"
        )

    return "\n".join(output_lines)

# Example usage (pass the parsed data, not the json module itself):
# print(transform_to_clean_markdown(json.loads(raw_json_text)))