#!/usr/bin/env python3
"""Unsupervised image clustering driven by a single metadata CSV.

Images listed in the CSV are embedded with a pretrained CNN backbone, reduced
with PCA, clustered (KMeans / DBSCAN / agglomerative) and plotted in 2-D.
"""
import os
import re
import json
import argparse
import warnings
from typing import Dict, List, Optional, Tuple

# Set before TensorFlow is imported so that its native logger sees the level
# when the library is loaded (setting it afterwards is too late for the
# import-time messages).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import tensorflow as tf
from keras.applications import MobileNetV2, EfficientNetB0
from keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess
from keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
from keras import backend as K

K.set_image_data_format("channels_last")

# Trailing "_a" / "_b" suffix that is dropped while normalising column names.
_AB_SUFFIX_RE = re.compile(r"^(.*)_(a|b)$")
# Separator characters deleted while normalising column names.
_SEPARATOR_TABLE = str.maketrans("", "", " _-./")
# Columns that may hold a file name (extension optional), in priority order.
_FILENAME_KEYS = (
    "filename", "file_name", "image", "image_name",
    "New_Name_With_Date", "New_Name", "Nombre_Nuevo", "Old_Name",
)
# Columns that hold a base name to which the extension is always appended.
_BASENAME_KEYS = ("basename_final", "basename")


# -----------------------------
# Utils
# -----------------------------
def set_seed(seed: int = 42):
    """Seed the NumPy and TensorFlow RNGs and export ``PYTHONHASHSEED``."""
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # influences child processes spawned later.
    os.environ["PYTHONHASHSEED"] = str(seed)


def ensure_dir(path: str):
    """Create ``path`` (including parents) if it does not exist yet."""
    os.makedirs(path, exist_ok=True)


def _read_csv_any(path: str) -> pd.DataFrame:
    """Read a CSV, trying several encodings before decoding lossily.

    Tries ``utf-8``, ``utf-8-sig`` and ``latin-1`` in that order. If all of
    them raise ``UnicodeDecodeError`` the file is read as UTF-8 with
    undecodable bytes replaced.
    """
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # ``encoding_errors`` is the pandas keyword (pandas >= 1.3); the former
    # ``errors=`` keyword is not accepted by ``read_csv`` and raised TypeError.
    return pd.read_csv(path, encoding="utf-8", encoding_errors="replace")


def _normalize_col_name(name: str) -> str:
    """Return a canonical form of a column name for alias matching.

    The name is stripped, lower-cased, a trailing ``_a`` / ``_b`` suffix is
    removed and the characters `` _-./`` are deleted. Non-string input yields
    an empty string.
    """
    if not isinstance(name, str):
        return ""
    s = name.strip().lower()
    m = _AB_SUFFIX_RE.match(s)
    if m:
        s = m.group(1)
    return s.translate(_SEPARATOR_TABLE)


def find_matching_cols(df: pd.DataFrame, aliases: List[str]) -> List[str]:
    """Return the columns of ``df`` whose normalised name matches any alias."""
    targets = {_normalize_col_name(a) for a in aliases}
    return [c for c in df.columns if _normalize_col_name(c) in targets]


def _non_empty_text(row: pd.Series, key: str) -> Optional[str]:
    """Return ``row[key]`` as stripped text, or None if absent/NaN/blank."""
    if key in row and pd.notna(row[key]):
        text = str(row[key]).strip()
        if text:
            return text
    return None


def best_filename_from_row(row: pd.Series, img_ext: str = ".jpg") -> Optional[str]:
    """Pick the image file name for a metadata row.

    File-name columns are checked first (``img_ext`` is appended only when the
    value has no extension), then base-name columns (``img_ext`` is always
    appended). Returns None when no candidate column holds a usable value.
    """
    for key in _FILENAME_KEYS:
        fname = _non_empty_text(row, key)
        if fname is not None:
            if not os.path.splitext(fname)[1]:
                fname = fname + img_ext
            return fname
    for key in _BASENAME_KEYS:
        base = _non_empty_text(row, key)
        if base is not None:
            # Stripped like the file-name branch so stray whitespace in the
            # CSV does not produce a non-existent path.
            return f"{base}{img_ext}"
    return None


def _index_files_by_name(root_dir: str) -> Dict[str, str]:
    """Map each file name under ``root_dir`` to the first path found for it."""
    index: Dict[str, str] = {}
    for root, _dirs, files in os.walk(root_dir):
        for name in files:
            # setdefault keeps the first hit in os.walk order.
            index.setdefault(name, os.path.join(root, name))
    return index


def attach_paths_single_csv(df: pd.DataFrame, images_dir: str, img_ext: str = ".jpg",
                            search_subdirs: bool = False) -> pd.DataFrame:
    """Add ``filename`` / ``path`` columns and drop rows without a file on disk.

    Args:
        df: Metadata table; one row per image.
        images_dir: Directory expected to contain the images.
        img_ext: Extension appended to names that lack one.
        search_subdirs: If True, files missing from ``images_dir`` itself are
            looked up by name anywhere below it.

    Returns:
        A copy of ``df`` restricted to rows whose image exists, re-indexed
        from 0. A warning reports how many rows were dropped.
    """
    filenames: List[Optional[str]] = []
    resolved: List[Optional[str]] = []
    subdir_index: Optional[Dict[str, str]] = None
    miss = 0

    for _, row in df.iterrows():
        fname = best_filename_from_row(row, img_ext) or None
        path = None
        if fname:
            candidate = os.path.join(images_dir, fname)
            if os.path.exists(candidate):
                path = candidate
            elif search_subdirs:
                # Walk the tree once, on the first miss, instead of once per
                # missing file.
                if subdir_index is None:
                    subdir_index = _index_files_by_name(images_dir)
                path = subdir_index.get(fname)
        if path is None:
            miss += 1
        filenames.append(fname)
        resolved.append(path)

    if miss:
        warnings.warn(f"{miss} archivos listados no existen en disco. Serán ignorados.")

    out = df.copy()
    out["filename"] = filenames
    out["path"] = resolved
    out = out[pd.notna(out["path"])].reset_index(drop=True)
    return out


# -----------------------------
# Embeddings
# -----------------------------
def make_preprocess(backbone: str):
    """Return the Keras ``preprocess_input`` function matching ``backbone``."""
    return mobilenet_preprocess if backbone == "mobilenet" else efficientnet_preprocess


def make_backbone_model(img_size: int, backbone: str) -> tf.keras.Model:
    """Build a frozen, headless backbone with global average pooling.

    ``backbone == "efficientnet"`` builds EfficientNetB0 (falling back to
    random weights, with a warning, if the ImageNet weights cannot be
    loaded); any other value builds MobileNetV2 with ImageNet weights.
    """
    tf.keras.backend.clear_session()
    K.set_image_data_format("channels_last")
    input_shape = (img_size, img_size, 3)
    if backbone == "efficientnet":
        try:
            model = EfficientNetB0(include_top=False, weights="imagenet",
                                   input_shape=input_shape, pooling="avg")
        except Exception as e:
            # Deliberate best effort: keep the pipeline running without
            # pretrained weights rather than aborting.
            warnings.warn(f"No se pudo cargar EfficientNetB0 con pesos ImageNet ({e}). Se usarán pesos aleatorios.")
            model = EfficientNetB0(include_top=False, weights=None,
                                   input_shape=input_shape, pooling="avg")
    else:
        model = MobileNetV2(include_top=False, weights="imagenet",
                            input_shape=input_shape, pooling="avg")
    model.trainable = False
    return model


def build_dataset(paths: List[str], img_size: int, preprocess_fn, batch_size: int = 64) -> tf.data.Dataset:
    """Build a batched ``tf.data`` pipeline that loads and preprocesses images.

    Each path is read, decoded to 3 channels, resized to
    ``img_size x img_size`` (bilinear, antialiased), cast to float32 and passed
    through ``preprocess_fn``.
    """
    def _load_tf(p):
        x = tf.io.read_file(p)
        x = tf.image.decode_jpeg(x, channels=3)
        x = tf.image.resize(x, [img_size, img_size], method="bilinear", antialias=True)
        x = tf.cast(x, tf.float32)
        x = preprocess_fn(x)
        return x

    ds = tf.data.Dataset.from_tensor_slices(paths)
    ds = ds.map(_load_tf, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


def compute_embeddings(model: tf.keras.Model, ds: tf.data.Dataset) -> np.ndarray:
    """Run ``model`` over ``ds`` and return the stacked predictions."""
    return model.predict(ds, verbose=1)


# -----------------------------
# Reduction + clustering
# -----------------------------
def fit_reduction(train_emb: np.ndarray, n_pca: int = 50):
    """Fit a StandardScaler followed by PCA on the training embeddings.

    The number of components is capped by both the feature count and the
    sample count, because PCA cannot extract more components than
    ``min(n_samples, n_features)``.

    Returns:
        ``(scaler, pca, Z)`` where ``Z`` is the reduced training matrix.
    """
    scaler = StandardScaler()
    Xs = scaler.fit_transform(train_emb)
    n_components = min(n_pca, Xs.shape[0], Xs.shape[1])
    pca = PCA(n_components=n_components)
    Z = pca.fit_transform(Xs)
    return scaler, pca, Z


def transform_reduction(emb: np.ndarray, scaler: StandardScaler, pca: PCA) -> np.ndarray:
    """Apply an already fitted scaler + PCA to ``emb``."""
    return pca.transform(scaler.transform(emb))


def _centers_from_labels(X: np.ndarray, y: np.ndarray) -> Optional[np.ndarray]:
    """Return per-cluster means of ``X`` (label -1 skipped), or None if none."""
    cs = [X[y == c].mean(axis=0) for c in sorted(set(y)) if c != -1]
    return np.array(cs) if cs else None


def tune_dbscan(train_feats: np.ndarray, metric: str = "euclidean",
                min_samples_grid=(3, 5, 10),
                quantiles=(0.6, 0.7, 0.8, 0.9)
                ) -> Tuple[Optional[DBSCAN], Optional[np.ndarray], Optional[np.ndarray]]:
    """Grid-search DBSCAN's ``eps`` / ``min_samples`` by silhouette score.

    For every ``min_samples`` candidate, ``eps`` candidates are taken as
    quantiles of the distance to the k-th nearest neighbour. Configurations
    that yield fewer than two non-noise clusters are discarded; the remaining
    ones are ranked by silhouette score over the non-noise points, computed
    with the same ``metric`` DBSCAN used.

    Returns:
        ``(model, labels, centers)`` for the best configuration, or
        ``(None, None, None)`` when no configuration qualifies.
    """
    n_samples = len(train_feats)
    if n_samples < 2:
        # A neighbour query with k >= 2 is impossible; nothing to tune.
        return None, None, None

    best_score = -np.inf
    best_model = None
    best_labels = None
    for ms in min_samples_grid:
        k = max(2, min(ms, n_samples - 1))
        nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(train_feats)
        dists, _ = nbrs.kneighbors(train_feats)
        kth = dists[:, -1]
        for q in quantiles:
            eps = float(np.quantile(kth, q))
            if eps <= 0:
                # DBSCAN requires eps > 0 (happens with many duplicate points).
                continue
            m = DBSCAN(eps=eps, min_samples=ms, metric=metric, n_jobs=-1)
            y = m.fit_predict(train_feats)
            keep = y != -1
            if len(np.unique(y[keep])) < 2:
                continue
            try:
                score = silhouette_score(train_feats[keep], y[keep], metric=metric)
            except ValueError:
                # Invalid label count for silhouette; skip this candidate.
                continue
            if score > best_score:
                best_score, best_model, best_labels = score, m, y

    if best_model is None:
        return None, None, None
    return best_model, best_labels, _centers_from_labels(train_feats, best_labels)


def fit_cluster_algo(kind: str, n_clusters: int, train_feats: np.ndarray,
                     fast_kmeans: bool = True,
                     dbscan_eps: float = 0.8, dbscan_min_samples: int = 5,
                     dbscan_metric: str = "euclidean", dbscan_auto: bool = False):
    """Fit the requested clustering algorithm on the training features.

    ``kind`` is ``"kmeans"`` or ``"dbscan"``; any other value selects
    agglomerative clustering. With ``dbscan_auto`` the DBSCAN parameters are
    tuned, and if tuning finds no valid configuration the function falls back
    to MiniBatchKMeans.

    Returns:
        ``(model, labels, centers)``; ``centers`` may be None when no
        non-noise cluster exists.
    """
    if kind == "kmeans":
        if fast_kmeans:
            m = MiniBatchKMeans(n_clusters=n_clusters, batch_size=2048, n_init=10, random_state=42)
        else:
            m = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        y = m.fit_predict(train_feats)
        return m, y, getattr(m, "cluster_centers_", None)

    if kind == "dbscan":
        if dbscan_auto:
            m, y, centers = tune_dbscan(train_feats, metric=dbscan_metric)
            if m is None:
                warnings.warn("DBSCAN(auto) no encontró ≥2 clusters. Fallback a KMeans.")
                km = MiniBatchKMeans(n_clusters=max(n_clusters, 2), batch_size=2048,
                                     n_init=10, random_state=42)
                y = km.fit_predict(train_feats)
                return km, y, km.cluster_centers_
            print(f"DBSCAN(auto) seleccionado (metric={dbscan_metric}).")
            return m, y, centers
        m = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples, metric=dbscan_metric, n_jobs=-1)
        y = m.fit_predict(train_feats)
        uniq = set(y) - {-1}
        if len(uniq) < 2:
            warnings.warn(f"DBSCAN devolvió {len(uniq)} cluster(s) válido(s). Considera ajustar eps/min_samples/metric o usar --dbscan_auto.")
        return m, y, _centers_from_labels(train_feats, y)

    ag = AgglomerativeClustering(n_clusters=n_clusters)
    y = ag.fit_predict(train_feats)
    centers = _centers_from_labels(train_feats, y)
    return ag, y, centers


def assign_to_nearest_centroid(feats: np.ndarray, centers: Optional[np.ndarray]) -> np.ndarray:
    """Label each row of ``feats`` with the index of its closest centre.

    Closeness is squared Euclidean distance. When ``centers`` is None or
    empty every row is labelled -1.
    """
    if centers is None or len(centers) == 0:
        return np.full((feats.shape[0],), -1, dtype=int)
    d = ((feats[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    return np.argmin(d, axis=1)


def internal_metrics(X: np.ndarray, y: np.ndarray) -> dict:
    """Compute silhouette, Calinski-Harabasz and Davies-Bouldin scores.

    Points labelled -1 are excluded. The scores need between 2 and
    ``n_samples - 1`` distinct labels; outside that range all three values
    are reported as None instead of raising.
    """
    mask = y != -1
    n_valid = int(mask.sum())
    n_labels = len(np.unique(y[mask]))
    if 2 <= n_labels < n_valid:
        return {
            "silhouette": float(silhouette_score(X[mask], y[mask])),
            "calinski_harabasz": float(calinski_harabasz_score(X[mask], y[mask])),
            "davies_bouldin": float(davies_bouldin_score(X[mask], y[mask])),
        }
    return {"silhouette": None, "calinski_harabasz": None, "davies_bouldin": None}


# -----------------------------
# Plot
# -----------------------------
def plot_scatter_2d(X2d: np.ndarray, labels: np.ndarray, title: str, out_path: str):
    """Save a 2-D scatter plot of ``X2d`` coloured by ``labels`` to ``out_path``.

    A single colour is used when there is at most one distinct label.
    """
    plt.figure(figsize=(8, 6))
    uniq = np.unique(labels)
    if len(uniq) <= 1:
        sns.scatterplot(x=X2d[:, 0], y=X2d[:, 1], s=12, linewidth=0,
                        color="#1f77b4", legend=False)
    else:
        palette = sns.color_palette("tab20", n_colors=len(uniq))
        sns.scatterplot(x=X2d[:, 0], y=X2d[:, 1], hue=labels, palette=palette,
                        s=12, linewidth=0, legend=False)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
# -----------------------------
# Main
# -----------------------------
def parse_args(argv=None):
    """Parse command-line options for the clustering pipeline.

    Args:
        argv: Optional list of argument strings; when None, ``sys.argv[1:]``
            is used (argparse's default behaviour).

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(description="Unsupervised clustering for Carciofo (single CSV)")
    p.add_argument("--images_dir", default=r"C:\Users\sof12\Desktop\ML\Datasets\Carciofo\GBIF",
                   help="Carpeta que contiene las imágenes")
    p.add_argument("--csv_path", default=r"C:\Users\sof12\Desktop\ML\Datasets\Carciofo\GBIF\joined_metadata.csv")
    p.add_argument("--out_dir", default=r"C:\Users\sof12\Desktop\ML\Datasets\Carciofo\GBIF\TrainingTEST_PCA_V1_C")
    p.add_argument("--img_ext", default=".jpg")
    p.add_argument("--img_size", type=int, default=224)
    p.add_argument("--batch_size", type=int, default=64)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--sample", type=int, default=None)
    p.add_argument("--search_subdirs", action="store_true",
                   help="Buscar archivos faltantes en subcarpetas")

    p.add_argument("--backbone", choices=["mobilenet", "efficientnet"], default="mobilenet")
    p.add_argument("--cluster", choices=["kmeans", "dbscan", "agglomerative"], default="kmeans")
    p.add_argument("--n_clusters", type=int, default=5)
    p.add_argument("--fast_kmeans", action="store_true")

    # DBSCAN
    p.add_argument("--dbscan_eps", type=float, default=0.8)
    p.add_argument("--dbscan_min_samples", type=int, default=5)
    p.add_argument("--dbscan_metric", choices=["euclidean", "cosine", "manhattan"], default="euclidean")
    p.add_argument("--dbscan_auto", action="store_true")
    return p.parse_args(argv)


def main():
    """Run the full pipeline: load metadata, embed, reduce, cluster, save.

    Outputs written to ``--out_dir``: per-split and combined assignment CSVs,
    the fitted scaler / PCA / clustering models, 2-D scatter plots,
    ``summary.json`` and the raw embeddings with their image paths.
    """
    args = parse_args()
    set_seed(args.seed)
    ensure_dir(args.out_dir)

    print("Loading CSV...")
    df = _read_csv_any(args.csv_path)

    print("Resolving filenames and verifying files on disk...")
    df = attach_paths_single_csv(df, args.images_dir, img_ext=args.img_ext,
                                 search_subdirs=args.search_subdirs)
    if len(df) == 0:
        print("No images found. Check images_dir and csv_path.")
        return

    # --- Only 'fase' (Carciofo does not use 'fase V' / 'fase R') ---
    phase_cols = find_matching_cols(df, ["fase"])
    if phase_cols:
        # Coalesce all matching columns; earlier columns win where they have
        # a value.
        ser_phase = None
        for c in phase_cols:
            ser_phase = df[c] if ser_phase is None else ser_phase.combine_first(df[c])
        df["fase"] = ser_phase
        print(f"Using column(s) for 'fase': {phase_cols}")
    else:
        warnings.warn("No se encontró columna 'fase' en el CSV. No se incluirá en el output.")
    # --- end fase ---

    # Optional sampling
    if args.sample is not None and args.sample < len(df):
        df = df.sample(n=args.sample, random_state=args.seed).reset_index(drop=True)

    # Split indices (70 % train, 15 % val, 15 % test)
    print("Splitting train/val/test...")
    idx_all = np.arange(len(df))
    idx_train, idx_tmp = train_test_split(idx_all, test_size=0.30,
                                          random_state=args.seed, shuffle=True)
    idx_val, idx_test = train_test_split(idx_tmp, test_size=0.50,
                                         random_state=args.seed, shuffle=True)

    df_train = df.iloc[idx_train].reset_index(drop=True)
    df_val = df.iloc[idx_val].reset_index(drop=True)
    df_test = df.iloc[idx_test].reset_index(drop=True)

    # Embeddings in one pass; rows are later selected by positional index.
    print("Building embedding model...")
    preprocess_fn = make_preprocess(args.backbone)
    model = make_backbone_model(args.img_size, args.backbone)

    print("Computing embeddings (one pass)...")
    ds_all = build_dataset(df["path"].tolist(), args.img_size, preprocess_fn, args.batch_size)
    emb_all = compute_embeddings(model, ds_all)
    emb_train = emb_all[idx_train]
    emb_val = emb_all[idx_val]
    emb_test = emb_all[idx_test]

    # PCA reduction
    print("Fitting PCA reduction (50D for clustering, 2D for plots)...")
    scaler, pca50, train_50 = fit_reduction(emb_train, n_pca=50)
    val_50 = transform_reduction(emb_val, scaler, pca50)
    test_50 = transform_reduction(emb_test, scaler, pca50)

    # Scale the training embeddings once and reuse them for fit + transform.
    train_scaled = scaler.transform(emb_train)
    pca2 = PCA(n_components=2).fit(train_scaled)
    train_2d = pca2.transform(train_scaled)
    val_2d = pca2.transform(scaler.transform(emb_val))
    test_2d = pca2.transform(scaler.transform(emb_test))

    # Clustering
    print(f"Clustering with {args.cluster}...")
    model_c, y_train, centers = fit_cluster_algo(
        args.cluster, args.n_clusters, train_50,
        fast_kmeans=args.fast_kmeans,
        dbscan_eps=args.dbscan_eps,
        dbscan_min_samples=args.dbscan_min_samples,
        dbscan_metric=args.dbscan_metric,
        dbscan_auto=args.dbscan_auto,
    )

    if args.cluster == "kmeans":
        y_val = model_c.predict(val_50)
        y_test = model_c.predict(test_50)
    else:
        # These paths have no usable predict() here, so held-out points are
        # assigned to the nearest training-cluster centre.
        y_val = assign_to_nearest_centroid(val_50, centers)
        y_test = assign_to_nearest_centroid(test_50, centers)

    # Metrics
    print("Computing internal metrics...")
    train_m = internal_metrics(train_50, y_train)
    val_m = internal_metrics(val_50, y_val)
    test_m = internal_metrics(test_50, y_test)

    # Save outputs (filename, fase, cluster, split)
    print("Saving outputs...")

    def pick_min(df_split: pd.DataFrame, y: np.ndarray, split: str) -> pd.DataFrame:
        """Keep the identifying columns that exist and add cluster/split."""
        cols = ["filename", "fase"]
        keep = [c for c in cols if c in df_split.columns]
        out = df_split[keep].copy()
        out["cluster"] = y
        out["split"] = split
        return out

    train_out = pick_min(df_train, y_train, "train")
    val_out = pick_min(df_val, y_val, "val")
    test_out = pick_min(df_test, y_test, "test")

    assignments_path = os.path.join(args.out_dir, "assignments.csv")
    assignments = pd.concat([train_out, val_out, test_out], ignore_index=True)
    assignments.to_csv(assignments_path, index=False, encoding="utf-8")
    train_out.to_csv(os.path.join(args.out_dir, "train_assignments.csv"), index=False, encoding="utf-8")
    val_out.to_csv(os.path.join(args.out_dir, "val_assignments.csv"), index=False, encoding="utf-8")
    test_out.to_csv(os.path.join(args.out_dir, "test_assignments.csv"), index=False, encoding="utf-8")

    # Save models
    joblib.dump(scaler, os.path.join(args.out_dir, "scaler.joblib"))
    joblib.dump(pca50, os.path.join(args.out_dir, "pca50.joblib"))
    joblib.dump(pca2, os.path.join(args.out_dir, "pca2.joblib"))
    joblib.dump(model_c, os.path.join(args.out_dir, f"{args.cluster}.joblib"))

    # Plots
    plot_scatter_2d(train_2d, y_train, f"Train clusters ({args.cluster})",
                    os.path.join(args.out_dir, "train_clusters_2d.png"))
    plot_scatter_2d(val_2d, y_val, f"Val clusters ({args.cluster})",
                    os.path.join(args.out_dir, "val_clusters_2d.png"))
    plot_scatter_2d(test_2d, y_test, f"Test clusters ({args.cluster})",
                    os.path.join(args.out_dir, "test_clusters_2d.png"))

    # Summary
    y_train_arr = np.asarray(y_train)
    summary = {
        "counts": {"train": len(df_train), "val": len(df_val), "test": len(df_test)},
        "cluster": args.cluster,
        "n_clusters": args.n_clusters,
        # Requested n_clusters is not what DBSCAN produces; also record the
        # number of non-noise clusters actually present in the training labels.
        "n_clusters_found": int(np.unique(y_train_arr[y_train_arr != -1]).size),
        "backbone": args.backbone,
        "img_size": args.img_size,
        "internal_metrics": {"train": train_m, "val": val_m, "test": test_m},
        "csv": assignments_path,
    }
    with open(os.path.join(args.out_dir, "summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    # Optional: save features
    np.save(os.path.join(args.out_dir, "features.npy"), emb_all)
    # Stored as a fixed-width unicode array rather than an object array so the
    # file is not pickled and loads with np.load's default allow_pickle=False.
    np.save(os.path.join(args.out_dir, "feature_paths.npy"),
            np.asarray(df["path"].tolist(), dtype=str))

    print("Done. Results saved to:", args.out_dir)


if __name__ == "__main__":
    main()