function.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. # utils.py
  2. import fitz
  3. import cv2
  4. import numpy as np
  5. import re
  6. def redresser_image_auto(img_array):
  7. gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
  8. binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4)
  9. h, w = binary.shape
  10. kh = cv2.getStructuringElement(cv2.MORPH_RECT, (w // 10, 1))
  11. kv = cv2.getStructuringElement(cv2.MORPH_RECT, (1, h // 10))
  12. score_h = cv2.countNonZero(cv2.morphologyEx(binary, cv2.MORPH_OPEN, kh))
  13. score_v = cv2.countNonZero(cv2.morphologyEx(binary, cv2.MORPH_OPEN, kv))
  14. if (score_v / 1.5) > (score_h * 1.3):
  15. return cv2.rotate(img_array, cv2.ROTATE_90_COUNTERCLOCKWISE)
  16. return img_array
  17. def obtenir_zone_tableau_total(img_array):
  18. gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
  19. blurred = cv2.GaussianBlur(gray, (5, 5), 0)
  20. binary = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4)
  21. h, w = binary.shape
  22. kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 5))
  23. dilated = cv2.dilate(binary, kernel, iterations=3)
  24. contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  25. if not contours: return 0, h
  26. y_points = []
  27. for c in contours:
  28. x, y, w_c, h_c = cv2.boundingRect(c)
  29. if h_c > 10:
  30. y_points.append(y); y_points.append(y + h_c)
  31. if not y_points: return 0, h
  32. return max(0, min(y_points) - 100), min(h, max(y_points) + 100)
  33. def preparer_image_zoom_hd(pdf_path, page_index):
  34. """Gère le double passage pour extraire une image HD cadrée."""
  35. doc = fitz.open(pdf_path)
  36. page = doc.load_page(page_index)
  37. print("page fitz",page)
  38. # 1. Localisation basse résolution
  39. pix_low = page.get_pixmap(matrix=fitz.Matrix(1, 1))
  40. img_low = np.frombuffer(pix_low.samples, dtype=np.uint8).reshape(pix_low.h, pix_low.w, 3)
  41. img_low = redresser_image_auto(img_low)
  42. y_min, y_max = obtenir_zone_tableau_total(img_low)
  43. # 2. Calcul du recadrage
  44. h_low = img_low.shape[0]
  45. y_start_pct = y_min / h_low
  46. y_end_pct = y_max / h_low
  47. full_rect = page.rect
  48. crop_rect = fitz.Rect(full_rect.x0, full_rect.y0 + (full_rect.height * y_start_pct),
  49. full_rect.x1, full_rect.y0 + (full_rect.height * y_end_pct))
  50. # 3. Rendu Haute Résolution (x4)
  51. pix_high = page.get_pixmap(matrix=fitz.Matrix(4, 4), clip=crop_rect, colorspace=fitz.csRGB)
  52. img_finale = np.frombuffer(pix_high.samples, dtype=np.uint8).reshape(pix_high.h, pix_high.w, 3)
  53. img_finale = redresser_image_auto(img_finale)
  54. doc.close()
  55. return img_finale
  56. def extraire_donnees_ocr(img, ocr_model):
  57. """Lance l'OCR et structure les résultats par coordonnées."""
  58. h_f, w_f = img.shape[:2]
  59. result = ocr_model.ocr(img, cls=True)
  60. extracted = []
  61. if result and result[0]:
  62. for line in result[0]:
  63. box, (text, conf) = line[0], line[1]
  64. if len(re.sub(r'[^a-zA-Z]', '', text)) > 2:
  65. continue
  66. if conf >= 0.6:
  67. x_c, y_c = sum(p[0] for p in box) / 4, sum(p[1] for p in box) / 4
  68. extracted.append({
  69. "text": text,
  70. "x_pct": round(x_c / w_f * 100, 1),
  71. "y_pct": round(y_c / h_f * 100, 1),
  72. "y_c": y_c,
  73. "x_c": x_c
  74. })
  75. extracted.sort(key=lambda r: (r["y_c"], r["x_c"]))
  76. return extracted
  77. def nettoyer_texte_ocr(text):
  78. if not text:
  79. return ""
  80. text = str(text)
  81. # supprimer artefacts OCR fréquents
  82. text = text.replace("]", "").replace("[", "").replace("/", "")
  83. # corriger O → 0 uniquement si texte numérique
  84. if re.match(r'^[\d\sO]+$', text):
  85. text = text.replace("O", "0")
  86. # corriger erreurs classiques R/RO
  87. text = text.replace("RO", "R0")
  88. # espaces propres
  89. text = re.sub(r"\s+", " ", text)
  90. return text.strip()
  91. def nettoyage_sortie_ocr(data):
  92. """
  93. Nettoie les données OCR pour DBSCAN :
  94. - supprime lignes vides
  95. - supprime artefacts inutiles
  96. - garde textes utiles (mots, codes R/C, nombres)
  97. """
  98. cleaned = []
  99. for r in data:
  100. text = nettoyer_texte_ocr(r.get("text", ""))
  101. # ❌ ignorer lignes totalement vides
  102. if not text:
  103. continue
  104. # ❌ ignorer bruit pur (ex: "---", "...")
  105. if re.match(r"^[\W_]+$", text):
  106. continue
  107. # ❌ ignorer "0" isolé (souvent bruit OCR)
  108. if text == "0":
  109. continue
  110. # garder seulement si:
  111. # - contient du texte utile OU
  112. # - contient chiffre OU
  113. if not re.search(r"[A-Za-z0-9]", text):
  114. continue
  115. cleaned.append({
  116. "x_pct": float(r["x_pct"]),
  117. "y_pct": float(r["y_pct"]),
  118. "text": text
  119. })
  120. return cleaned
  121. def formater_donnees_section(data_page, page_index):
  122. """
  123. Format lisible + prêt à parser
  124. """
  125. lignes = [f"\n--- DONNÉES PAGE {page_index + 1} ---"]
  126. for r in data_page:
  127. x = float(r["x_pct"])
  128. y = float(r["y_pct"])
  129. text = r["text"]
  130. lignes.append(f"x={x:.1f}% | y={y:.1f}% | {text}")
  131. return lignes
  132. def to_points(data):
  133. """
  134. Convertit directement en format DBSCAN :
  135. (x, y, text)
  136. """
  137. return [(r["x_pct"], r["y_pct"], r["text"]) for r in data]
  138. def sauvegarder_fichier_unique(contenu_total, pdf_path, section_name):
  139. """Sauvegarde toutes les pages accumulées dans un seul fichier."""
  140. # Nettoyage du nom de fichier
  141. nom_propre = pdf_path.replace('.pdf', '').replace(' ', '_')
  142. filename = f"{nom_propre}_{section_name}_complet.txt"
  143. with open(filename, "w", encoding="utf-8") as f:
  144. f.write("\n".join(contenu_total))
  145. return filename
  146. ########## DBSCAN
  147. import numpy as np
  148. def prepare_for_dbscan(points):
  149. """
  150. points = [[x, y, text], ...]
  151. """
  152. coords = np.array([[p[1]] for p in points]) # 🔥 uniquement Y
  153. return coords
  154. from sklearn.cluster import DBSCAN
  155. def cluster_lines(points, eps=0.4, min_samples=2):
  156. coords = prepare_for_dbscan(points)
  157. db = DBSCAN(eps=eps, min_samples=min_samples)
  158. labels = db.fit_predict(coords)
  159. clusters = {}
  160. for label, point in zip(labels, points):
  161. if label == -1:
  162. continue # bruit
  163. clusters.setdefault(label, []).append(point)
  164. return list(clusters.values())
  165. def build_lines(clusters):
  166. lignes = []
  167. for cluster in clusters:
  168. # tri gauche → droite
  169. cluster_sorted = sorted(cluster, key=lambda p: p[0])
  170. texte = " ".join([p[2] for p in cluster_sorted])
  171. lignes.append({
  172. "y": np.mean([p[1] for p in cluster]),
  173. "text": texte,
  174. "points": cluster_sorted
  175. })
  176. # tri haut → bas
  177. lignes = sorted(lignes, key=lambda l: l["y"])
  178. return lignes
  179. def merge_close_lines(lignes, threshold=0.6):
  180. merged = []
  181. prev = None
  182. for line in lignes:
  183. if prev is None:
  184. prev = line
  185. continue
  186. if abs(line["y"] - prev["y"]) < threshold:
  187. prev["text"] += " " + line["text"]
  188. else:
  189. merged.append(prev)
  190. prev = line
  191. if prev:
  192. merged.append(prev)
  193. return merged
  194. def split_tables(points, eps_y=2.0):
  195. """
  196. Sépare les tableaux par distance verticale (Y)
  197. """
  198. import numpy as np
  199. from sklearn.cluster import DBSCAN
  200. y_coords = np.array([[p[1]] for p in points]) # فقط Y
  201. clustering = DBSCAN(eps=eps_y, min_samples=5).fit(y_coords)
  202. tables = {}
  203. for label, point in zip(clustering.labels_, points):
  204. if label == -1:
  205. continue
  206. tables.setdefault(label, []).append(point)
  207. return list(tables.values())