Abdenbi
/
SFCR_Extraction


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
							import pandas as pd 
import os
import glob

def extract_pdf_path(filename, sources_dir=None):
    """Locate the source PDF matching a ``<company>_<year>.pdf`` file name.

    The name is split on its last underscore into a company part and a year
    part. The year part names the sub-folder that is searched, and the company
    part is matched case-insensitively as a prefix of the PDF file names found
    in that folder.

    Args:
        filename: Name of the form ``<company>_<year>.pdf``; the ``.pdf``
            suffix is optional and matched case-insensitively.
        sources_dir: Directory containing one sub-folder per year. Defaults to
            ``../01 - Sources`` relative to the directory of this file.

    Returns:
        Absolute path of the matching PDF. A file whose name (without
        extension) equals the requested name is preferred; otherwise the first
        prefix match in sorted order is returned.

    Raises:
        ValueError: If ``filename`` has no ``_`` separator or an empty
            company/year part.
        FileNotFoundError: If no PDF in the year folder starts with the
            company name.
    """
    # Strip only a trailing ".pdf" (any case); str.replace would also remove
    # ".pdf" occurring in the middle of the name and would miss ".PDF".
    stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
    entreprise_name, separator, year = stem.rpartition("_")
    if not separator or not entreprise_name.strip() or not year.strip():
        raise ValueError(
            f"Nom de fichier invalide : '{filename}' "
            f"(format attendu : '<entreprise>_<année>.pdf')"
        )

    if sources_dir is None:
        # Path based on this file's location, not on the current directory.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        sources_dir = os.path.join(script_dir, "..", "01 - Sources")
    folder_abs = os.path.abspath(os.path.join(sources_dir, year))

    # Escape the folder so characters such as "[" in the path are not read as
    # glob wildcards; sort so the result does not depend on listing order.
    all_pdfs = sorted(glob.glob(os.path.join(glob.escape(folder_abs), "*.pdf")))

    entreprise_lower = entreprise_name.strip().lower()
    matches = [
        path for path in all_pdfs
        if os.path.basename(path).lower().startswith(entreprise_lower)
    ]

    if not matches:
        raise FileNotFoundError(
            f"Aucun PDF trouvé pour '{entreprise_name}' en {year}\n"
            f"PDFs disponibles :\n" +
            "\n".join([os.path.basename(p) for p in all_pdfs])
        )

    # Several files can share the prefix ("Acme" / "Acme Holding"): prefer the
    # one whose name is exactly the requested one.
    wanted_stem = stem.lower()
    for path in matches:
        if os.path.splitext(os.path.basename(path))[0].lower() == wanted_stem:
            return path
    return matches[0]


def _parse_page_cell(val):
    """Convert one Excel page cell (e.g. ``"135, 136"`` or ``135.0``) to a list of ints."""
    if val is None or (not isinstance(val, str) and pd.isna(val)):
        return []

    pages = []
    for token in str(val).split(","):
        token = token.strip()
        if not token:
            # Tolerate blank cells and stray separators such as "135, 136,".
            continue
        try:
            pages.append(int(token))
        except ValueError:
            # A numeric cell in a column that also has blanks is read as a
            # float, so str() yields "135.0"; accept it when it is integral.
            as_float = float(token)
            if not as_float.is_integer():
                raise ValueError(f"Numéro de page invalide : '{token}'") from None
            pages.append(int(as_float))
    return pages


def extract_pages(filename, excel_path=None):
    """Read the SFCR section page numbers configured for a PDF.

    The file name is split on its last underscore into a company name and a
    year. The matching row (company compared case-insensitively, ignoring
    surrounding whitespace) is looked up in the "Liste SFCR" sheet of the
    settings workbook, whose header is on the fourth row.

    Args:
        filename: Name of the form ``<company>_<year>.pdf``; the ``.pdf``
            suffix is optional and matched case-insensitively.
        excel_path: Path of the settings workbook. Defaults to
            ``../02 - Inputs/_QRTs_paramétrages_Abd.xlsx`` relative to the
            directory of this file.

    Returns:
        Dict mapping each section code ("S.02", "S.05", "S.12", "S.22",
        "S.25", "S.28") to its list of page numbers; the list is empty when
        the cell is blank or the column is absent.

    Raises:
        ValueError: If ``filename`` is malformed, if no row matches the
            company and year, or if a page cell holds a non-integer value.
        KeyError: If the entity or year column cannot be found in the sheet.
    """
    # Strip only a trailing ".pdf" (any case); str.replace would also remove
    # ".pdf" occurring in the middle of the name and would miss ".PDF".
    stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
    entreprise_name, separator, year_text = stem.rpartition("_")
    if not separator or not entreprise_name.strip():
        raise ValueError(
            f"Nom de fichier invalide : '{filename}' "
            f"(format attendu : '<entreprise>_<année>.pdf')"
        )
    try:
        year = int(year_text)
    except ValueError as err:
        raise ValueError(
            f"Année invalide dans le nom de fichier : '{filename}'"
        ) from err

    if excel_path is None:
        # Path based on this file's location, not on the current directory.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        excel_path = os.path.abspath(
            os.path.join(script_dir, "..", "02 - Inputs", "_QRTs_paramétrages_Abd.xlsx")
        )

    df = pd.read_excel(excel_path, sheet_name="Liste SFCR", header=3)
    df.columns = [str(col).strip() for col in df.columns]

    # Column names contain accents; match on the accent-free fragment so a
    # differently encoded header is still found.
    entity_col = next((col for col in df.columns if "ntit" in col), None)
    if entity_col is None:
        raise KeyError("Colonne 'Entité' introuvable dans la feuille 'Liste SFCR'")
    if "Année" in df.columns:
        year_col = "Année"
    else:
        year_col = next((col for col in df.columns if col.startswith("Ann")), None)
    if year_col is None:
        raise KeyError("Colonne 'Année' introuvable dans la feuille 'Liste SFCR'")

    # Normalise without the .str accessor, which fails on non-text columns.
    wanted_name = entreprise_name.strip().lower()
    names = df[entity_col].map(
        lambda value: str(value).strip().lower() if pd.notna(value) else None
    )
    # Years may be stored as numbers or as text depending on the workbook.
    years = pd.to_numeric(df[year_col], errors="coerce")

    hits = df[(names == wanted_name) & (years == year)]
    if hits.empty:
        raise ValueError(f"Aucune ligne trouvée pour '{entreprise_name}' en {year}")
    row = hits.iloc[0]

    # Excel column -> SFCR section mapping.
    section_columns = {
        "S.02": "Pages S.02",
        "S.05": "Pages S.05",
        "S.12": "Pages S.12",
        "S.22": "Pages S.22",
        "S.25": "Pages S.25",
        "S.28": "Pages S.28",
    }

    return {
        section: _parse_page_cell(row.get(col, None))
        for section, col in section_columns.items()
    }