# clean_DBSCAN.py

  1. import json
  2. def get_closest_col(x, current_cols, max_distance=8.0):
  3. """Trouve la colonne la plus proche, dans une distance max (en unités PDF)."""
  4. best = min(current_cols, key=lambda c: abs(x - c[0]))
  5. if abs(x - best[0]) <= max_distance:
  6. return best
  7. return None # Aucune colonne assez proche
  8. def transform_to_clean_markdown(data):
  9. current_cols = []
  10. output_lines = []
  11. for entry in data:
  12. points = entry.get("points", [])
  13. if not points:
  14. continue
  15. # 1. Détection des en-têtes : tout ce qui est sur une ligne "header"
  16. # On détecte la ligne header si elle contient AU MOINS un Cxxxx
  17. has_header = any(str(p[2]).startswith('C') for p in points)
  18. if has_header:
  19. if output_lines:
  20. output_lines.append("\n---\n")
  21. # On prend TOUS les points comme colonnes (y compris 06000, etc.)
  22. # sauf ceux qui ressemblent à un label de ligne (Rxxxx)
  23. current_cols = [(p[0], p[2]) for p in points if not str(p[2]).startswith('R')]
  24. col_names = [c[1] for c in current_cols]
  25. output_lines.append(f"| Code | {' | '.join(col_names)} |")
  26. output_lines.append(f"| :--- | {' | '.join([':---'] * len(col_names))} |")
  27. continue
  28. # 2. Lignes de données (Rxxxx)
  29. row_label_pt = next((p for p in points if str(p[2]).startswith('R')), None)
  30. if row_label_pt and current_cols:
  31. row_label = row_label_pt[2]
  32. row_dict = {c[1]: "0" for c in current_cols}
  33. for p in points:
  34. x_val, _, text = p
  35. if text == row_label:
  36. continue
  37. best_col = get_closest_col(x_val, current_cols, max_distance=8.0)
  38. if best_col:
  39. row_dict[best_col[1]] = str(text).strip()
  40. # Si aucune colonne proche → on ignore (valeur hors tableau)
  41. ordered_values = [row_dict[c[1]] for c in current_cols]
  42. output_lines.append(f"| **{row_label}** | {' | '.join(ordered_values)} |")
  43. return "\n".join(output_lines)
# Example usage (pass the parsed data, not the json module itself):
# print(transform_to_clean_markdown(data))