#!/usr/bin/env python3

# Script for unsupervised image clustering using PCA and various clustering algorithms.
# It merges metadata from two CSV files, computes image embeddings using a pre-trained
# CNN backbone, reduces dimensionality with PCA, and applies clustering algorithms.
# Author: Sofia Garcias Arcila
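
# Example invocation (script name and paths are illustrative, not prescriptive):
#   python unsupervised_clustering.py --images_dir <IMAGES_DIR> --csv_GBIF <GBIF_CSV> \
#       --csv_AV <AV_CSV> --out_dir <OUT_DIR> --backbone efficientnet \
#       --cluster kmeans --n_clusters 4 --fast_kmeans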

# Imports: load the necessary libraries; argparse handles the command-line arguments.
import os
import re
import json
import argparse
import warnings
from typing import List, Optional

# Numerical and data handling
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MiniBatchKMeans
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
# Base model used to extract embeddings
import tensorflow as tf
from keras.applications import MobileNetV2, EfficientNetB0
from keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess
from keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
from keras import backend as K

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
K.set_image_data_format("channels_last")

# -----------------------------
# Utility functions, written to keep the code clean and avoid repetition
# -----------------------------

# Set random seeds for reproducibility
def set_seed(seed: int = 42):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

# Ensure directory exists, and if not, create it
def ensure_dir(path: str):
    os.makedirs(path, exist_ok=True)

# Extract the base name (without extension) from a string; used to normalize image names
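# e.g. guess_basename("photos/IMG_0001.JPG") returns "IMG_0001" (hypothetical path)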
def guess_basename(s: Optional[str]) -> Optional[str]:
    if s is None or (isinstance(s, float) and np.isnan(s)) or str(s).strip() == "":
        return None
    name = os.path.basename(str(s))
    base, _ = os.path.splitext(name)
    return base if base else None

# Find the first existing column from a list of candidates in a DataFrame
def first_existing_column(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    for c in candidates:
        if c in df.columns:
            return c
    return None

# Normalize column names for matching; needed because the two CSVs name their columns differently
def _normalize_col_name(name: str) -> str:
    if not isinstance(name, str):
        return ""
    s = name.strip().lower()
    m = re.match(r"^(.*)_(a|b)$", s)  # strip the _a/_b suffix added by the merge, if present
    if m:
        s = m.group(1)
    for ch in [" ", "_", "-", ".", "/"]:
        s = s.replace(ch, "")
    return s

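# For example, "Fase V", "fase_v", and a merged column like "Fase V_a" all normalize
# to "fasev", so the aliases passed in can match columns coming from either CSV.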
def find_matching_cols(df: pd.DataFrame, aliases: List[str]) -> List[str]:
    targets = {_normalize_col_name(a) for a in aliases}
    matches = []
    for col in df.columns:
        if _normalize_col_name(col) in targets:
            matches.append(col)
    return matches

# Build filename from row based on priority of columns
def build_filename_from_row(row: pd.Series, img_ext: str = ".jpg") -> Optional[str]:
    for key in ["New_Name_With_Date", "New_Name", "Nombre_Nuevo"]:
        if key in row and pd.notna(row[key]) and str(row[key]).strip() != "":
            fname = str(row[key]).strip()
            if not os.path.splitext(fname)[1]:
                fname = fname + img_ext
            return fname
    for key in ["basename_final", "basename"]:
        if key in row and pd.notna(row[key]) and str(row[key]).strip() != "":
            return f"{row[key]}{img_ext}"
    if "Old_Name" in row and pd.notna(row["Old_Name"]) and str(row["Old_Name"]).strip() != "":
        fname = str(row["Old_Name"]).strip()
        if not os.path.splitext(fname)[1]:
            fname = fname + img_ext
        return fname
    return None

# -----------------------------
# Load and merge the CSVs (the two files have different columns)
# -----------------------------

# Read each CSV trying several encodings, then merge on guessed basenames, returning a unified DataFrame
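# Note: the outer merge keeps rows found in either CSV; rows whose image is missing
# on disk are dropped later by attach_filenames_and_paths().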
def load_and_merge_csvs(csv_GBIF: str, csv_AV: str) -> pd.DataFrame:
    def read_csv_any(path: str) -> pd.DataFrame:
        for enc in ("utf-8", "utf-8-sig", "latin-1"):
            try:
                return pd.read_csv(path, encoding=enc)
            except UnicodeDecodeError:
                continue
        # Last resort: replace undecodable bytes instead of failing
        return pd.read_csv(path, encoding="utf-8", encoding_errors="replace")

    df_GBIF = read_csv_any(csv_GBIF)
    df_AV = read_csv_any(csv_AV)

    a_fname_col = first_existing_column(
        df_GBIF,
        ["New_Name_With_Date", "New_Name", "Nombre_Nuevo", "Old_Name", "Nombre_Anterior", "Filename"],
    )
    if a_fname_col is None:
        str_cols = [c for c in df_GBIF.columns if df_GBIF[c].dtype == object]
        a_fname_col = str_cols[0] if str_cols else None

    df_GBIF = df_GBIF.copy()
    df_GBIF["basename_a"] = df_GBIF[a_fname_col].apply(guess_basename) if a_fname_col else None

    b_base_col = first_existing_column(df_AV, ["basename", "basename_final", "basename_json", "basename_csv"])
    if b_base_col is None:
        b_fname_col = first_existing_column(
            df_AV,
            ["New_Name_With_Date", "New_Name", "Nombre_Nuevo", "Old_Name", "Nombre_Anterior", "Filename"],
        )
        df_AV["basename_b"] = df_AV[b_fname_col].apply(guess_basename) if b_fname_col else None
    else:
        df_AV["basename_b"] = df_AV[b_base_col].apply(lambda x: str(x).strip() if pd.notna(x) else None)

    merged = pd.merge(
        df_GBIF, df_AV,
        left_on="basename_a", right_on="basename_b",
        how="outer", suffixes=("_a", "_b")
    )
    merged["basename"] = merged["basename_a"].fillna(merged["basename_b"])
    return merged

# Attach filenames and full paths to the DataFrame, verifying file existence and dropping missing files
def attach_filenames_and_paths(df: pd.DataFrame, images_dir: str, img_ext: str = ".jpg") -> pd.DataFrame:
    rows = []
    for _, row in df.iterrows():
        fname = build_filename_from_row(row, img_ext=img_ext)
        if fname is None:
            rows.append(None)
            continue
        full_path = os.path.join(images_dir, fname)
        rows.append((fname, full_path))

    df = df.copy()
    df["filename_path_tuple"] = rows
    df["filename"] = df["filename_path_tuple"].apply(lambda t: t[0] if t else None)
    df["path"] = df["filename_path_tuple"].apply(lambda t: t[1] if t else None)
    df.drop(columns=["filename_path_tuple"], inplace=True)

    df["exists"] = df["path"].apply(lambda p: os.path.exists(p) if isinstance(p, str) else False)
    missing = (~df["exists"]).sum()
    if missing > 0:
        warnings.warn(f"{missing} listed files do not exist on disk. They will be ignored.")
    return df[df["exists"]].reset_index(drop=True)

# -----------------------------
# Embeddings extraction
# -----------------------------

# Create preprocessing function based on backbone choice
def make_preprocess(backbone: str):
    return mobilenet_preprocess if backbone == "mobilenet" else efficientnet_preprocess

# Create the CNN without its top layer (feature extractor): returns a model that maps each image
# to a global-average-pooled feature vector (1280-D for both MobileNetV2 and EfficientNetB0)
def make_backbone_model(img_size: int, backbone: str = "mobilenet") -> tf.keras.Model:
    """Create the embedding extractor; fall back to random weights if EfficientNet's ImageNet weights cannot be loaded."""
    tf.keras.backend.clear_session()
    K.set_image_data_format("channels_last")
    input_shape = (img_size, img_size, 3)
    if backbone == "efficientnet":
        try:
            base = EfficientNetB0(include_top=False, weights="imagenet", input_shape=input_shape, pooling="avg")
        except Exception as e:
            warnings.warn(f"Could not load EfficientNetB0 with ImageNet weights ({e}). "
                          f"EfficientNetB0 will be used with random weights (not pre-trained).")
            base = EfficientNetB0(include_top=False, weights=None, input_shape=input_shape, pooling="avg")
    else:
        base = MobileNetV2(include_top=False, weights="imagenet", input_shape=input_shape, pooling="avg")

    base.trainable = False
    return base

# Build an efficient tf.data pipeline that reads, decodes, resizes, and preprocesses images in batches
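# Note: tf.image.decode_jpeg assumes JPEG inputs, which matches the default --img_ext of ".jpg".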
def build_dataset(paths: List[str], img_size: int, preprocess_fn, batch_size: int = 64) -> tf.data.Dataset:
    ds = tf.data.Dataset.from_tensor_slices(paths)

    def _load_tf(p):
        img_bytes = tf.io.read_file(p)
        img = tf.image.decode_jpeg(img_bytes, channels=3)
        img = tf.image.resize(img, [img_size, img_size], method="bilinear", antialias=True)
        img = tf.cast(img, tf.float32)
        img = preprocess_fn(img)
        return img

    ds = ds.map(_load_tf, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

def compute_embeddings(model: tf.keras.Model, ds: tf.data.Dataset) -> np.ndarray:
    return model.predict(ds, verbose=1)

# -----------------------------
# Dimensionality reduction with PCA
# -----------------------------

# Scale the features with StandardScaler and fit PCA to reduce to n_pca dimensions (50 by default)
def fit_reduction(train_emb: np.ndarray, n_pca: int = 50):
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_emb)
    pca = PCA(n_components=min(n_pca, train_scaled.shape[1]))
    train_pca = pca.fit_transform(train_scaled)
    return scaler, pca, train_pca

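# Note: after fitting, pca.explained_variance_ratio_.sum() gives the fraction of variance
# retained by the kept components (a quick sanity check; not computed in this script).
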
# Transform the other splits (val/test) using the scaler and PCA fitted on the train set
def transform_reduction(emb: np.ndarray, scaler: StandardScaler, pca: PCA) -> np.ndarray:
    return pca.transform(scaler.transform(emb))

# Clustering: choose between k-means, DBSCAN, and agglomerative clustering; returns the fitted model, labels, and cluster centers
def fit_cluster_algo(cluster: str, n_clusters: int, train_feats: np.ndarray, fast: bool = True):
    if cluster == "kmeans":
        if fast:
            km = MiniBatchKMeans(n_clusters=n_clusters, batch_size=2048, n_init=10, random_state=42)
        else:
            km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        km.fit(train_feats)
        return km, km.labels_, km.cluster_centers_
    elif cluster == "dbscan":
        db = DBSCAN(eps=0.8, min_samples=5, n_jobs=-1)
        db.fit(train_feats)
        centers = []
        labels = db.labels_
        for c in sorted(set(labels)):
            if c == -1:  # -1 marks DBSCAN noise points; they get no centroid
                continue
            centers.append(train_feats[labels == c].mean(axis=0))
        centers = np.array(centers) if centers else None
        return db, labels, centers
    else:
        ag = AgglomerativeClustering(n_clusters=n_clusters)
        labels = ag.fit_predict(train_feats)
        centers = np.vstack([train_feats[labels == c].mean(axis=0) for c in range(n_clusters)])
        return ag, labels, centers

# Assign new samples to the nearest centroid by Euclidean distance; useful when the method has no predict()
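# feats has shape (n_samples, n_features) and centers (n_clusters, n_features); the broadcasting
# below yields an (n_samples, n_clusters) matrix of squared distances.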
def assign_to_nearest_centroid(feats: np.ndarray, centers: Optional[np.ndarray]) -> np.ndarray:
    if centers is None or len(centers) == 0:
        return np.full((feats.shape[0],), -1, dtype=int)
    dists = ((feats[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    return np.argmin(dists, axis=1)

# Compute internal clustering metrics: Silhouette, Calinski-Harabasz, Davies-Bouldin
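# (Higher silhouette and Calinski-Harabasz indicate better-separated clusters; lower
# Davies-Bouldin is better. Points labeled -1, i.e. DBSCAN noise, are excluded.)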
def internal_metrics(X: np.ndarray, labels: np.ndarray) -> dict:
    mask = labels != -1
    res = {}
    if mask.sum() > 1 and len(np.unique(labels[mask])) > 1:
        res["silhouette"] = float(silhouette_score(X[mask], labels[mask]))
        res["calinski_harabasz"] = float(calinski_harabasz_score(X[mask], labels[mask]))
        res["davies_bouldin"] = float(davies_bouldin_score(X[mask], labels[mask]))
    else:
        res["silhouette"] = None
        res["calinski_harabasz"] = None
        res["davies_bouldin"] = None
    return res

# -----------------------------
# Plots: 2D visualization of the clusters
# -----------------------------

# Scatter plot in 2D colored by cluster labels
def plot_scatter_2d(X2d: np.ndarray, labels: np.ndarray, title: str, out_path: str):
    plt.figure(figsize=(8, 6))
    palette = sns.color_palette("tab20", n_colors=max(2, len(np.unique(labels))))
    sns.scatterplot(x=X2d[:, 0], y=X2d[:, 1], hue=labels, palette=palette, s=12, linewidth=0, legend=False)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()

# -----------------------------
# Main
# -----------------------------

# Argument parsing
def parse_args():
    parser = argparse.ArgumentParser(description="Unsupervised image clustering (fast)")
    parser.add_argument("--images_dir", default=r"C:\Users\sof12\Desktop\ML\Datasets\Nocciola_GBIF")
    parser.add_argument("--csv_GBIF", default=r"C:\Users\sof12\Desktop\ML\Datasets\Nocciola_GBIF\change_namesAV.csv")
    parser.add_argument("--csv_AV", default=r"C:\Users\sof12\Desktop\ML\Datasets\Nocciola_GBIF\metadatos_unidos.csv")
    parser.add_argument("--out_dir", default=r"C:\Users\sof12\Desktop\ML\Datasets\Nocciola_GBIF\TrainingV4")
    parser.add_argument("--img_ext", default=".jpg")
    parser.add_argument("--img_size", type=int, default=224)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--sample", type=int, default=None)
    parser.add_argument("--backbone", choices=["mobilenet", "efficientnet"], default="efficientnet")
    parser.add_argument("--cluster", choices=["kmeans", "dbscan", "agglomerative"], default="kmeans")
    parser.add_argument("--n_clusters", type=int, default=4)
    parser.add_argument("--fast_kmeans", action="store_true", help="Use MiniBatchKMeans to speed up k-means")
    return parser.parse_args()

# Main function:
# 1) Load and merge CSVs
# 2) Divide into train/val/test
# 3) Compute embeddings (one pass)
# 4) Fit PCA reduction (50D and 2D)
# 5) Clustering
# 6) Compute internal metrics and save outputs

def main():
    args = parse_args()
    set_seed(args.seed)
    ensure_dir(args.out_dir)

    # 1) Load and merge CSVs
    print("Loading and merging CSVs...")
    merged = load_and_merge_csvs(args.csv_GBIF, args.csv_AV)

    print("Resolving filenames and verifying files on disk...")
    merged = attach_filenames_and_paths(merged, args.images_dir, img_ext=args.img_ext)
    if len(merged) == 0:
        print("No images found. Check images_dir and CSVs.")
        return

    # Standardize 'fase V' and 'fase R' if they exist with variant names
    v_cols = find_matching_cols(merged, ["fase v", "fase_v", "fasev", "faseV", "Fase V"])
    r_cols = find_matching_cols(merged, ["fase r", "fase_r", "faser", "faseR", "Fase R"])

    if v_cols:
        ser_v = None
        for c in v_cols:
            ser_v = merged[c] if ser_v is None else ser_v.combine_first(merged[c])
        merged["fase V"] = ser_v
        print(f"Using columns for 'fase V': {v_cols}")
    else:
        warnings.warn("No equivalent column found for 'fase V'.")

    if r_cols:
        ser_r = None
        for c in r_cols:
            ser_r = merged[c] if ser_r is None else ser_r.combine_first(merged[c])
        merged["fase R"] = ser_r
        print(f"Using columns for 'fase R': {r_cols}")
    else:
        warnings.warn("No equivalent column found for 'fase R'.")

    # 2) Optional sampling
    if args.sample is not None and args.sample < len(merged):
        merged = merged.sample(n=args.sample, random_state=args.seed).reset_index(drop=True)

    # 3) Split train/val/test
    print("Splitting train/val/test...")
    idx_all = np.arange(len(merged))
    idx_train, idx_tmp = train_test_split(idx_all, test_size=0.30, random_state=args.seed, shuffle=True)
    idx_val, idx_test = train_test_split(idx_tmp, test_size=0.50, random_state=args.seed, shuffle=True)

    df_train = merged.iloc[idx_train].reset_index(drop=True)
    df_val = merged.iloc[idx_val].reset_index(drop=True)
    df_test = merged.iloc[idx_test].reset_index(drop=True)

    # 4) Embeddings extraction (one pass for all images)
    print("Building embedding model...")
    preprocess_fn = make_preprocess(args.backbone)
    model = make_backbone_model(args.img_size, backbone=args.backbone)

    print("Computing embeddings (one pass for all images)...")
    ds_all = build_dataset(merged["path"].tolist(), args.img_size, preprocess_fn, args.batch_size)
    emb_all = compute_embeddings(model, ds_all)

    emb_train = emb_all[idx_train]
    emb_val = emb_all[idx_val]
    emb_test = emb_all[idx_test]

    # 5) Dimensionality reduction
    print("Fitting PCA reduction (50D for clustering, 2D for plots)...")
    scaler, pca50, train_50 = fit_reduction(emb_train, n_pca=50)
    val_50 = transform_reduction(emb_val, scaler, pca50)
    test_50 = transform_reduction(emb_test, scaler, pca50)

    pca2 = PCA(n_components=2).fit(scaler.transform(emb_train))
    train_2d = pca2.transform(scaler.transform(emb_train))
    val_2d = pca2.transform(scaler.transform(emb_val))
    test_2d = pca2.transform(scaler.transform(emb_test))

    # 6) Clustering
    print(f"Clustering with {args.cluster}...")
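    # Note: with --cluster kmeans, the expression below always enables the MiniBatchKMeans
    # fast path, regardless of whether --fast_kmeans was passed.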
    cluster_model, y_train_clusters, centers = fit_cluster_algo(
        args.cluster, args.n_clusters, train_50, fast=args.fast_kmeans or args.cluster == "kmeans"
    )

    if args.cluster == "kmeans":
        y_val_clusters = cluster_model.predict(val_50)
        y_test_clusters = cluster_model.predict(test_50)
    else:
        y_val_clusters = assign_to_nearest_centroid(val_50, centers)
        y_test_clusters = assign_to_nearest_centroid(test_50, centers)

    # 7) Internal metrics (optional for summary)
    print("Computing internal metrics...")
    train_internal = internal_metrics(train_50, y_train_clusters)
    val_internal = internal_metrics(val_50, y_val_clusters)
    test_internal = internal_metrics(test_50, y_test_clusters)

    # 8) Save minimal requested outputs
    print("Saving outputs...")
    ensure_dir(args.out_dir)

    def pick_min_columns(df_split: pd.DataFrame, clusters: np.ndarray, split_name: str) -> pd.DataFrame:
        cols_wanted = ["filename", "fase V", "fase R"]
        cols_exist = [c for c in cols_wanted if c in df_split.columns]
        missing = [c for c in cols_wanted if c not in df_split.columns]
        if missing:
            warnings.warn(f"Missing columns in {split_name}: {missing}")
        out = df_split[cols_exist].copy()
        out["cluster"] = clusters
        out["split"] = split_name
        return out

    train_min = pick_min_columns(df_train, y_train_clusters, "train")
    val_min = pick_min_columns(df_val, y_val_clusters, "val")
    test_min = pick_min_columns(df_test, y_test_clusters, "test")

    assignments_all = pd.concat([train_min, val_min, test_min], ignore_index=True)
    assignments_all.to_csv(os.path.join(args.out_dir, "assignments.csv"), index=False, encoding="utf-8")

    train_min.to_csv(os.path.join(args.out_dir, "train_assignments.csv"), index=False, encoding="utf-8")
    val_min.to_csv(os.path.join(args.out_dir, "val_assignments.csv"), index=False, encoding="utf-8")
    test_min.to_csv(os.path.join(args.out_dir, "test_assignments.csv"), index=False, encoding="utf-8")

    # Models for reproducibility
    joblib.dump(scaler, os.path.join(args.out_dir, "scaler.joblib"))
    joblib.dump(pca50, os.path.join(args.out_dir, "pca50.joblib"))
    joblib.dump(pca2, os.path.join(args.out_dir, "pca2.joblib"))
    joblib.dump(cluster_model, os.path.join(args.out_dir, f"{args.cluster}.joblib"))
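
    # Sketch (not executed here) of how the saved artifacts could be reused on new embeddings;
    # "new_emb" and OUT_DIR below are illustrative, not defined in this script:
    #   scaler = joblib.load(os.path.join(OUT_DIR, "scaler.joblib"))
    #   pca50 = joblib.load(os.path.join(OUT_DIR, "pca50.joblib"))
    #   kmeans = joblib.load(os.path.join(OUT_DIR, "kmeans.joblib"))  # when --cluster kmeans
    #   new_labels = kmeans.predict(pca50.transform(scaler.transform(new_emb)))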

    # Plots 2D
    plot_scatter_2d(train_2d, y_train_clusters, f"Train clusters ({args.cluster})", os.path.join(args.out_dir, "train_clusters_2d.png"))
    plot_scatter_2d(val_2d, y_val_clusters, f"Val clusters ({args.cluster})", os.path.join(args.out_dir, "val_clusters_2d.png"))
    plot_scatter_2d(test_2d, y_test_clusters, f"Test clusters ({args.cluster})", os.path.join(args.out_dir, "test_clusters_2d.png"))

    summary = {
        "counts": {"train": len(df_train), "val": len(df_val), "test": len(df_test)},
        "cluster": args.cluster,
        "n_clusters": args.n_clusters,
        "backbone": args.backbone,
        "img_size": args.img_size,
        "internal_metrics": {"train": train_internal, "val": val_internal, "test": test_internal},
        "output_files": {
            "all": os.path.join(args.out_dir, "assignments.csv"),
            "train": os.path.join(args.out_dir, "train_assignments.csv"),
            "val": os.path.join(args.out_dir, "val_assignments.csv"),
            "test": os.path.join(args.out_dir, "test_assignments.csv"),
        },
    }
    with open(os.path.join(args.out_dir, "summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print("Done. Results saved to:", args.out_dir)

    np.save(os.path.join(args.out_dir, "features.npy"), emb_all)
    np.save(os.path.join(args.out_dir, "feature_paths.npy"), merged["path"].to_numpy())
    print(f"Features saved to {args.out_dir}\\features.npy and feature_paths.npy")


if __name__ == "__main__":
    main()