# Abdenbi / SFCR_Extraction
							import pandas as pd 
import os
import glob

def extract_pdf_path(filename):
    """Locate the source PDF that corresponds to an extraction file name.

    ``filename`` is expected to look like ``<company>_<year>.pdf``. The year
    selects the folder ``../01 - Sources/<year>`` (relative to this script),
    and the company is compared, case-insensitively, with the part of each
    PDF's base name that precedes the first ``-``.

    Args:
        filename: Name of the form ``<company>_<year>``, with an optional
            ``.pdf`` extension in any letter case.

    Returns:
        Absolute path of the matching PDF. When several PDFs match, the
        first one in sorted path order is returned.

    Raises:
        ValueError: If ``filename`` contains no ``_`` separating the company
            from the year.
        FileNotFoundError: If no PDF in the year folder matches the company.
    """
    # Drop only a trailing ".pdf" (any case); str.replace would also remove
    # ".pdf" occurring inside the name and would leave ".PDF" in the year.
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        stem = filename

    entreprise_name, separator, année_extraction = stem.rpartition("_")
    if not separator:
        raise ValueError(
            f"Nom de fichier invalide '{filename}' : "
            f"format attendu '<entreprise>_<année>.pdf'"
        )

    # Path is anchored on this file's location, not the current directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    folder_abs = os.path.abspath(
        os.path.join(script_dir, "..", "01 - Sources", année_extraction)
    )

    # Escape the folder so characters such as "[" in a parent directory name
    # are not interpreted as glob patterns; sort for a deterministic pick.
    all_pdfs = sorted(glob.glob(os.path.join(glob.escape(folder_abs), "*.pdf")))

    entreprise_lower = entreprise_name.strip().lower()

    for pdf_path in all_pdfs:
        source_name = os.path.basename(pdf_path).split("-")[0].strip().lower()
        if source_name == entreprise_lower:
            return pdf_path

    raise FileNotFoundError(
        f"Aucun PDF trouvé pour '{entreprise_name}' en {année_extraction}\n"
        f"PDFs disponibles :\n" +
        "\n".join([os.path.basename(p) for p in all_pdfs])
    )

   

def _parse_page_list(val):
    """Convert one Excel cell holding page numbers into a list of ints.

    Accepts comma-separated text such as ``"135, 136"`` as well as cells
    whose text form is a whole-number float such as ``"135.0"``.

    Raises:
        ValueError: If a token is not a whole number.
    """
    pages = []
    for token in str(val).split(","):
        token = token.strip()
        if not token:
            # Tolerate stray separators such as "135, 136,".
            continue
        try:
            pages.append(int(token))
        except ValueError:
            # pandas returns floats for numeric columns that contain blanks,
            # so a single page number may arrive as "135.0".
            number = float(token)
            if not number.is_integer():
                raise ValueError(f"Numéro de page invalide : '{token}'") from None
            pages.append(int(number))
    return pages


def extract_pages(filename):
    """Read the page numbers of each SFCR section for one company and year.

    The company and year are taken from ``filename``
    (``<company>_<year>.pdf``). The matching row is looked up in the
    "Liste SFCR" sheet of ``../02 - Inputs/_QRTs_paramétrages_Abd.xlsx``
    (relative to this script), and each "Pages S.xx" cell is parsed.

    Args:
        filename: Name of the form ``<company>_<year>``, with an optional
            ``.pdf`` extension in any letter case.

    Returns:
        Dict mapping each section code (``"S.02"``, ``"S.04"``, ...) to a
        list of page numbers; the list is empty when the cell is blank or
        the column is missing.

    Raises:
        ValueError: If ``filename`` has no ``_`` separator, the year is not
            an integer, no row matches the company and year, or a page cell
            contains something other than whole numbers.
        KeyError: If the sheet has no entity column or no "Année" column.
    """
    # Drop only a trailing ".pdf" (any case); str.replace would also remove
    # ".pdf" occurring inside the name and would leave ".PDF" in the year.
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        stem = filename

    entreprise_name, separator, year_text = stem.rpartition("_")
    if not separator:
        raise ValueError(
            f"Nom de fichier invalide '{filename}' : "
            f"format attendu '<entreprise>_<année>.pdf'"
        )
    année_extraction = int(year_text)

    # Path is anchored on this file's location, not the current directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    excel_path = os.path.abspath(
        os.path.join(script_dir, "..", "02 - Inputs", "_QRTs_paramétrages_Abd.xlsx")
    )

    # Column headers are on the fourth row of the sheet (header=3).
    df = pd.read_excel(excel_path, sheet_name="Liste SFCR", header=3)
    df.columns = [str(col).strip() for col in df.columns]

    # Match on "ntit" so the lookup works whatever the first letter of the
    # entity header looks like (accent or encoding differences).
    entite_col = next((col for col in df.columns if "ntit" in col), None)
    if entite_col is None:
        raise KeyError("Colonne 'Entité' introuvable dans la feuille 'Liste SFCR'")

    # Keep the row for this company (case-insensitive) and year.
    same_company = (
        df[entite_col].str.strip().str.lower() == entreprise_name.strip().lower()
    )
    same_year = df["Année"] == année_extraction
    row = df[same_company & same_year]

    if row.empty:
        raise ValueError(f"Aucune ligne trouvée pour '{entreprise_name}' en {année_extraction}")

    row = row.iloc[0]

    # Excel column for each SFCR section: "S.02" -> "Pages S.02", etc.
    section_codes = (
        "S.02", "S.04", "S.05", "S.12", "S.17",
        "S.19", "S.22", "S.23", "S.25", "S.28",
    )
    section_columns = {code: f"Pages {code}" for code in section_codes}

    sections = {}
    for section, col in section_columns.items():
        val = row.get(col)
        has_value = pd.notna(val) and str(val).strip() != ""
        pages = _parse_page_list(val) if has_value else []
        if not pages:
            print(f"Aucune page trouvée pour la section {section} ")
        sections[section] = pages

    return sections