# Phenology/Code/Unsupervised_learning/Clustering_Carciofo.py
"""
Clustering Non supervised for Hazelnut (Nocciola)
Pipeline complete:
1. Feature extraction with ResNet50/EfficientNet
2. Dimensionality reduction (PCA + UMAP)
3. Hierarchical and optimized K-Means clustering
4. Advanced visualization and results analysis
5. Comparison with real labels (external validation)
"""
import os
import random
import warnings
import json
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Deep Learning
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, EfficientNetB0
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.efficientnet import preprocess_input as efficient_preprocess
from tensorflow.keras.preprocessing import image
# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.manifold import TSNE
# UMAP for better visualization
try:
    import umap
    UMAP_AVAILABLE = True
except ImportError:
    print("UMAP is not available. Install it with: pip install umap-learn")
    UMAP_AVAILABLE = False
# Hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
warnings.filterwarnings('ignore')
# =============================================================================
# CONFIGURATION
# =============================================================================
# Paths
CSV_PATH = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\tags.csv'
IMAGES_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow'
OUTPUT_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\results_clustering_avanzado_8C'
# Processing parameters
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42
# Data splitting parameters
SPLIT_RATIO = {
    'train': 0.70,
    'val': 0.15,
    'test': 0.15
}
# Clustering parameters
N_CLUSTERS_RANGE = range(8, 15)  # Test from 8 to 14 clusters
PCA_COMPONENTS = 50 # PCA components for analysis
UMAP_COMPONENTS = 2 # For visualization
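# Note: scikit-learn requires PCA n_components <= min(n_samples, n_features),
# so PCA_COMPONENTS = 50 assumes the dataset contains at least 50 usable images.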
# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
# =============================================================================
# UTILITIES
# =============================================================================
def set_seed(seed=42):
    """Set seed for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"Seed set: {seed}")


def log_message(message, level='INFO'):
    """Logging with timestamp"""
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")


def safe_read_csv(path):
    """Read CSV with encoding handling"""
    if not os.path.exists(path):
        raise FileNotFoundError(f'CSV not found: {path}')
    for encoding in ['utf-8', 'latin-1', 'cp1252']:
        try:
            df = pd.read_csv(path, encoding=encoding)
            log_message(f"CSV read successfully with encoding: {encoding}")
            return df
        except UnicodeDecodeError:
            continue
    raise ValueError("Could not read CSV with any encoding")


def find_image_path(images_dir, img_name):
    """Find the full path of an image"""
    if pd.isna(img_name) or str(img_name).strip() == '':
        return None
    img_name = str(img_name).strip()
    for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
        # Name without its original extension, plus this extension
        img_path = os.path.join(images_dir, os.path.splitext(img_name)[0] + ext)
        if os.path.exists(img_path):
            return img_path
    # Name exactly as given in the CSV
    img_path = os.path.join(images_dir, img_name)
    if os.path.exists(img_path):
        return img_path
    # Search recursively in subdirectories
    for root, _dirs, _files in os.walk(images_dir):
        for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
            img_path = os.path.join(root, os.path.splitext(img_name)[0] + ext)
            if os.path.exists(img_path):
                return img_path
    return None
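
# Resolution order used by find_image_path(): the name combined with each known
# extension inside IMAGES_DIR, then the name exactly as given, then a recursive
# search of all subdirectories. Example (hypothetical file name):
# find_image_path(IMAGES_DIR, 'IMG_0001') would also match 'IMG_0001.jpg' found
# anywhere below IMAGES_DIR.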
# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
class DatasetLoader:
    """Class to load and prepare the dataset"""

    def __init__(self, csv_path, images_dir):
        self.csv_path = csv_path
        self.images_dir = images_dir
        self.df = None
        self.df_clean = None

    def load_and_clean(self):
        """Load and clean the data"""
        log_message("Loading dataset...")
        # Read CSV
        self.df = safe_read_csv(self.csv_path)
        log_message(f"Original dataset: {len(self.df)} rows")
        # Show available columns
        log_message(f"Available columns: {list(self.df.columns)}")
        # Detect phase columns
        available_cols = []
        for col in self.df.columns:
            if 'fase' in col.lower():
                available_cols.append(('fase_P', col))
        log_message(f"Phase columns detected: {available_cols}")
        # Identify the image column
        img_col = None
        for col in ['id_img', 'imagen', 'image', 'filename', 'file']:
            if col in self.df.columns:
                img_col = col
                break
        if img_col is None:
            # Fall back to the first text column that looks like file names
            for col in self.df.columns:
                if self.df[col].dtype == 'object':
                    sample = str(self.df[col].iloc[0])
                    if any(ext in sample.lower() for ext in ['.jpg', '.png', '.jpeg']):
                        img_col = col
                        break
        if img_col is None:
            raise ValueError("Could not identify the image column")
        log_message(f"Image column: {img_col}")
        # Keep only rows whose image file exists on disk
        valid_rows = []
        for idx, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Verifying images"):
            img_path = find_image_path(self.images_dir, row[img_col])
            if img_path:
                valid_row = {
                    'image_name': os.path.basename(img_path),
                    'image_path': img_path,
                    'fase_P': row.get('fase', np.nan)
                }
                valid_rows.append(valid_row)
        self.df_clean = pd.DataFrame(valid_rows)
        log_message(f"Valid images found: {len(self.df_clean)}")
        # Distribution analysis
        self.analyze_distribution()
        return self.df_clean

    def analyze_distribution(self):
        """Analyze the phase distribution"""
        log_message("\n=== Phase Distribution Analysis ===")
        if 'fase_P' in self.df_clean.columns:
            fase_p_counts = self.df_clean['fase_P'].value_counts()
            log_message(f"\nfase ({len(fase_p_counts)} classes):")
            for fase, count in fase_p_counts.items():
                print(f"  {fase}: {count} images")

    def split_data(self, split_ratio=SPLIT_RATIO):
        """Split the data into train/val/test"""
        log_message("\n🔀 Splitting data...")
        # Shuffle data
        df_shuffled = self.df_clean.sample(frac=1, random_state=SEED).reset_index(drop=True)
        n = len(df_shuffled)
        n_train = int(n * split_ratio['train'])
        n_val = int(n * split_ratio['val'])
        train_df = df_shuffled.iloc[:n_train].copy()
        val_df = df_shuffled.iloc[n_train:n_train + n_val].copy()
        test_df = df_shuffled.iloc[n_train + n_val:].copy()
        train_df['split'] = 'train'
        val_df['split'] = 'val'
        test_df['split'] = 'test'
        log_message(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
        # Save splits
        split_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
        split_df.to_csv(os.path.join(OUTPUT_DIR, 'data_splits.csv'), index=False)
        return train_df, val_df, test_df, split_df
# =============================================================================
# FEATURE EXTRACTION
# =============================================================================
class FeatureExtractor:
    """Feature extractor using pre-trained CNNs"""

    def __init__(self, model_name='resnet50'):
        self.model_name = model_name
        self.model = None
        self.preprocess_fn = None
        self._build_model()

    def _build_model(self):
        """Build the feature extraction model"""
        log_message(f"Building feature extractor: {self.model_name}")
        if self.model_name.lower() == 'resnet50':
            self.model = ResNet50(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = resnet_preprocess
        elif self.model_name.lower() == 'efficientnet':
            self.model = EfficientNetB0(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = efficient_preprocess
        else:
            raise ValueError(f"Unsupported model: {self.model_name}")
        log_message(f"Model built. Output shape: {self.model.output_shape}")

    def load_and_preprocess_image(self, img_path):
        """Load and preprocess one image"""
        try:
            img = image.load_img(img_path, target_size=IMG_SIZE)
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = self.preprocess_fn(img_array)
            return img_array
        except Exception as e:
            log_message(f"Error loading image {img_path}: {e}", level='ERROR')
            return None

    def extract_features(self, df):
        """Extract features from all images"""
        log_message(f"\nExtracting features from {len(df)} images...")
        features_list = []
        valid_indices = []
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
            img_array = self.load_and_preprocess_image(row['image_path'])
            if img_array is not None:
                features = self.model.predict(img_array, verbose=0)
                features_list.append(features.flatten())
                valid_indices.append(idx)
        features_array = np.array(features_list)
        log_message(f"Features extracted: {features_array.shape}")
        return features_array, valid_indices
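
# Optional sketch (not called by the pipeline): calling model.predict() once per
# image is simple but slow; batching the forward passes is usually much faster.
# This is a minimal, hedged variant that reuses the FeatureExtractor above and the
# global BATCH_SIZE; it assumes the same DataFrame layout ('image_path' column).
def extract_features_batched(extractor, df, batch_size=BATCH_SIZE):
    """Batched variant of FeatureExtractor.extract_features() (illustrative only)."""
    features_list, valid_indices = [], []
    batch, batch_idx = [], []
    for idx, row in df.iterrows():
        img_array = extractor.load_and_preprocess_image(row['image_path'])
        if img_array is None:
            continue
        batch.append(img_array[0])  # drop the leading batch axis added during preprocessing
        batch_idx.append(idx)
        if len(batch) == batch_size:
            feats = extractor.model.predict(np.stack(batch), verbose=0)
            features_list.append(feats.reshape(len(batch), -1))
            valid_indices.extend(batch_idx)
            batch, batch_idx = [], []
    if batch:  # flush the last partial batch
        feats = extractor.model.predict(np.stack(batch), verbose=0)
        features_list.append(feats.reshape(len(batch), -1))
        valid_indices.extend(batch_idx)
    features_array = np.vstack(features_list) if features_list else np.empty((0, 0))
    return features_array, valid_indices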
# =============================================================================
# CLUSTERING AND ANALYSIS
# =============================================================================
class ClusteringAnalyzer:
    """Advanced clustering analysis"""

    def __init__(self, features, df, output_dir):
        self.features = features
        self.df = df
        self.output_dir = output_dir
        self.scaler = StandardScaler()
        self.features_scaled = None
        self.pca = None
        self.features_pca = None
        self.umap_model = None
        self.features_umap = None

    def preprocess_features(self):
        """Preprocess features"""
        log_message("\nPreprocessing features...")
        # Scale
        self.features_scaled = self.scaler.fit_transform(self.features)
        log_message(f"Features scaled: {self.features_scaled.shape}")
        # PCA
        self.pca = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
        self.features_pca = self.pca.fit_transform(self.features_scaled)
        variance_explained = np.sum(self.pca.explained_variance_ratio_)
        log_message(f"PCA: {PCA_COMPONENTS} components explain {variance_explained:.2%} of variance")
        # UMAP (if available)
        if UMAP_AVAILABLE:
            log_message("Applying UMAP for visualization...")
            self.umap_model = umap.UMAP(
                n_components=UMAP_COMPONENTS,
                n_neighbors=15,
                min_dist=0.1,
                metric='euclidean',
                random_state=SEED
            )
            self.features_umap = self.umap_model.fit_transform(self.features_pca)
            log_message(f"UMAP completed: {self.features_umap.shape}")
        else:
            # Use t-SNE as an alternative
            log_message("Applying t-SNE for visualization...")
            tsne = TSNE(n_components=2, random_state=SEED, perplexity=30)
            self.features_umap = tsne.fit_transform(self.features_pca)
            log_message(f"t-SNE completed: {self.features_umap.shape}")
    def find_optimal_clusters(self):
        """Find optimal number of clusters"""
        log_message("\nSearching for optimal number of clusters...")
        metrics = {
            'n_clusters': [],
            'silhouette': [],
            'calinski_harabasz': [],
            'davies_bouldin': []
        }
        for n in tqdm(N_CLUSTERS_RANGE, desc="Evaluating clusters"):
            kmeans = KMeans(n_clusters=n, random_state=SEED, n_init=10)
            labels = kmeans.fit_predict(self.features_pca)
            metrics['n_clusters'].append(n)
            metrics['silhouette'].append(silhouette_score(self.features_pca, labels))
            metrics['calinski_harabasz'].append(calinski_harabasz_score(self.features_pca, labels))
            metrics['davies_bouldin'].append(davies_bouldin_score(self.features_pca, labels))
        # Visualize metrics
        self.plot_clustering_metrics(metrics)
        # Find the optimum (silhouette score)
        best_idx = np.argmax(metrics['silhouette'])
        best_n = metrics['n_clusters'][best_idx]
        log_message(f"Optimal number of clusters (Silhouette): {best_n}")
        return best_n, metrics

    def perform_clustering(self, n_clusters):
        """Perform clustering with different algorithms"""
        log_message(f"\nApplying clustering with {n_clusters} clusters...")
        results = {}
        # K-Means
        log_message("Applying K-Means...")
        kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=20)
        results['kmeans'] = kmeans.fit_predict(self.features_pca)
        # Hierarchical Clustering
        log_message("Applying Hierarchical Clustering...")
        hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
        results['hierarchical'] = hierarchical.fit_predict(self.features_pca)
        # Save the cluster labels on the DataFrame
        for method, labels in results.items():
            self.df[f'cluster_{method}'] = labels
        log_message(f"Clustering completed: {len(results)} methods")
        return results

    def evaluate_clustering(self, labels, method_name):
        """Evaluate clustering quality"""
        silhouette = silhouette_score(self.features_pca, labels)
        calinski = calinski_harabasz_score(self.features_pca, labels)
        davies = davies_bouldin_score(self.features_pca, labels)
        log_message(f"\nMetrics for {method_name}:")
        log_message(f"  Silhouette Score: {silhouette:.4f}")
        log_message(f"  Calinski-Harabasz: {calinski:.4f}")
        log_message(f"  Davies-Bouldin: {davies:.4f}")
        # Compare with true labels if they exist
        if 'fase_P' in self.df.columns:
            # Filter NaN
            valid_mask = ~self.df['fase_P'].isna()
            if valid_mask.sum() > 0:
                true_labels = pd.Categorical(self.df.loc[valid_mask, 'fase_P']).codes
                pred_labels = labels[valid_mask]
                ari = adjusted_rand_score(true_labels, pred_labels)
                nmi = normalized_mutual_info_score(true_labels, pred_labels)
                log_message("\nExternal Validation (vs fase):")
                log_message(f"  Adjusted Rand Index: {ari:.4f}")
                log_message(f"  Normalized Mutual Info: {nmi:.4f}")
        return {
            'silhouette': silhouette,
            'calinski_harabasz': calinski,
            'davies_bouldin': davies
        }

    def plot_clustering_metrics(self, metrics):
        """Visualize clustering metrics"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Clustering Evaluation Metrics', fontsize=16, fontweight='bold')
        # Silhouette Score
        axes[0, 0].plot(metrics['n_clusters'], metrics['silhouette'], 'b-o', linewidth=2)
        axes[0, 0].set_xlabel('Number of Clusters')
        axes[0, 0].set_ylabel('Silhouette Score')
        axes[0, 0].set_title('Silhouette Score (higher is better)')
        axes[0, 0].grid(True, alpha=0.3)
        # Calinski-Harabasz
        axes[0, 1].plot(metrics['n_clusters'], metrics['calinski_harabasz'], 'g-o', linewidth=2)
        axes[0, 1].set_xlabel('Number of Clusters')
        axes[0, 1].set_ylabel('Calinski-Harabasz Score')
        axes[0, 1].set_title('Calinski-Harabasz Score (higher is better)')
        axes[0, 1].grid(True, alpha=0.3)
        # Davies-Bouldin
        axes[1, 0].plot(metrics['n_clusters'], metrics['davies_bouldin'], 'r-o', linewidth=2)
        axes[1, 0].set_xlabel('Number of Clusters')
        axes[1, 0].set_ylabel('Davies-Bouldin Score')
        axes[1, 0].set_title('Davies-Bouldin Score (lower is better)')
        axes[1, 0].grid(True, alpha=0.3)
        # Summary
        axes[1, 1].axis('off')
        best_silhouette = metrics['n_clusters'][np.argmax(metrics['silhouette'])]
        best_calinski = metrics['n_clusters'][np.argmax(metrics['calinski_harabasz'])]
        best_davies = metrics['n_clusters'][np.argmin(metrics['davies_bouldin'])]
        summary_text = f"""
        Optimal Number of Clusters:
        Silhouette Score: {best_silhouette}
        Calinski-Harabasz: {best_calinski}
        Davies-Bouldin: {best_davies}
        Recommendation: {best_silhouette} clusters
        (based on Silhouette Score)
        """
        axes[1, 1].text(0.1, 0.5, summary_text, fontsize=12, verticalalignment='center',
                        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'clustering_metrics.png'), dpi=300, bbox_inches='tight')
        log_message("Saved: clustering_metrics.png")
        plt.close()

    def visualize_clusters(self, labels, method_name, title_suffix=""):
        """Visualize clusters in 2D space"""
        if self.features_umap is None:
            log_message("⚠️ No reduced features available for visualization", level='WARNING')
            return
        fig, axes = plt.subplots(1, 2, figsize=(20, 8))
        fig.suptitle(f'Clusters Visualization - {method_name} {title_suffix}',
                     fontsize=16, fontweight='bold')
        # Plot 1: Clusters
        scatter = axes[0].scatter(
            self.features_umap[:, 0],
            self.features_umap[:, 1],
            c=labels,
            cmap='tab20',
            s=50,
            alpha=0.6,
            edgecolors='black',
            linewidth=0.5
        )
        axes[0].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
        axes[0].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
        axes[0].set_title('Clusters Found', fontsize=14)
        axes[0].grid(True, alpha=0.3)
        plt.colorbar(scatter, ax=axes[0], label='Cluster ID')
        # Plot 2: Real phases (if available)
        if 'fase_P' in self.df.columns:
            fase_p_codes = pd.Categorical(self.df['fase_P']).codes
            scatter2 = axes[1].scatter(
                self.features_umap[:, 0],
                self.features_umap[:, 1],
                c=fase_p_codes,
                cmap='viridis',
                s=50,
                alpha=0.6,
                edgecolors='black',
                linewidth=0.5
            )
            axes[1].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
            axes[1].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
            axes[1].set_title('Real Phases (fase_P)', fontsize=14)
            axes[1].grid(True, alpha=0.3)
            plt.colorbar(scatter2, ax=axes[1], label='fase_P')
        else:
            axes[1].axis('off')
            axes[1].text(0.5, 0.5, 'No real labels available',
                         ha='center', va='center', fontsize=14)
        plt.tight_layout()
        filename = f'clusters_visualization_{method_name}.png'
        plt.savefig(os.path.join(self.output_dir, filename), dpi=300, bbox_inches='tight')
        log_message(f"Saved visualization: {filename}")
        plt.close()

    def plot_dendrogram(self):
        """Plot the hierarchical dendrogram"""
        log_message("\nGenerating dendrogram...")
        # Calculate linkage
        linkage_matrix = linkage(self.features_pca, method='ward')
        plt.figure(figsize=(20, 10))
        dendrogram(
            linkage_matrix,
            truncate_mode='lastp',
            p=30,
            leaf_rotation=90,
            leaf_font_size=10,
            show_contracted=True
        )
        plt.title('Hierarchical Clustering Dendrogram', fontsize=16, fontweight='bold')
        plt.xlabel('Sample Index', fontsize=12)
        plt.ylabel('Distance', fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'dendrogram.png'), dpi=300, bbox_inches='tight')
        log_message("Dendrogram saved: dendrogram.png")
        plt.close()

    def analyze_cluster_composition(self, labels, method_name):
        """Analyze cluster composition vs real phases"""
        if 'fase_P' not in self.df.columns:
            return
        log_message(f"\nAnalyzing cluster composition ({method_name})...")
        # Create contingency table
        contingency = pd.crosstab(
            labels,
            self.df['fase_P'],
            margins=True
        )
        # Save the table
        contingency.to_csv(
            os.path.join(self.output_dir, f'cluster_composition_{method_name}.csv')
        )
        # Plot heatmap
        plt.figure(figsize=(14, 10))
        sns.heatmap(
            contingency.iloc[:-1, :-1],  # drop the margin row/column
            annot=True,
            fmt='d',
            cmap='YlOrRd',
            cbar_kws={'label': 'Number of Images'}
        )
        plt.title(f'Cluster Composition vs fase_P - {method_name}',
                  fontsize=14, fontweight='bold')
        plt.xlabel('fase_P (Real)', fontsize=12)
        plt.ylabel('Cluster ID', fontsize=12)
        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, f'cluster_heatmap_{method_name}.png'),
            dpi=300,
            bbox_inches='tight'
        )
        log_message(f"Heatmap saved: cluster_heatmap_{method_name}.png")
        plt.close()
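
# Optional sketch (not called by main()): a simple purity score for the external
# validation step, i.e. the fraction of images whose cluster's majority phase
# matches their own phase. It complements the ARI/NMI reported above and assumes
# `labels` (cluster ids) and `true_labels` (phase codes) are equal-length arrays.
def cluster_purity(labels, true_labels):
    """Purity = sum of per-cluster majority-class counts, divided by N (illustrative only)."""
    labels = np.asarray(labels)
    true_labels = np.asarray(true_labels)
    total = 0
    for cluster_id in np.unique(labels):
        members = true_labels[labels == cluster_id]
        if len(members) > 0:
            _, counts = np.unique(members, return_counts=True)
            total += counts.max()  # count of the most common phase in this cluster
    return total / len(labels)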
# =============================================================================
# MAIN PIPELINE
# =============================================================================
def main():
    """Main clustering pipeline"""
    print("\n" + "="*80)
    print("🎓 ADVANCED UNSUPERVISED CLUSTERING - CARCIOFO")
    print("="*80 + "\n")
    # Set seed
    set_seed(SEED)
    # 1. LOAD DATA
    log_message("="*80)
    log_message("STEP 1: DATA LOADING AND PREPARATION")
    log_message("="*80)
    loader = DatasetLoader(CSV_PATH, IMAGES_DIR)
    df_clean = loader.load_and_clean()
    if len(df_clean) == 0:
        log_message("No valid data found", level='ERROR')
        return
    train_df, val_df, test_df, full_df = loader.split_data()
    # 2. EXTRACT FEATURES
    log_message("\n" + "="*80)
    log_message("STEP 2: FEATURE EXTRACTION")
    log_message("="*80)
    extractor = FeatureExtractor(model_name='resnet50')
    # Extract features for all data
    features, valid_indices = extractor.extract_features(full_df)
    # Keep only the rows for which features were extracted
    df_with_features = full_df.iloc[valid_indices].reset_index(drop=True)
    # Save features
    np.save(os.path.join(OUTPUT_DIR, 'features.npy'), features)
    df_with_features.to_csv(os.path.join(OUTPUT_DIR, 'data_with_features.csv'), index=False)
    # 3. CLUSTERING AND ANALYSIS
    log_message("\n" + "="*80)
    log_message("STEP 3: CLUSTERING AND ANALYSIS")
    log_message("="*80)
    analyzer = ClusteringAnalyzer(features, df_with_features, OUTPUT_DIR)
    analyzer.preprocess_features()
    # Find the optimal number of clusters
    optimal_n, cluster_search_metrics = analyzer.find_optimal_clusters()
    # Cluster with the optimal number
    clustering_results = analyzer.perform_clustering(optimal_n)
    # 4. EVALUATION AND VISUALIZATION
    log_message("\n" + "="*80)
    log_message("STEP 4: EVALUATION AND VISUALIZATION")
    log_message("="*80)
    # Dendrogram
    analyzer.plot_dendrogram()
    # Evaluate each method
    evaluation_results = {}
    for method, labels in clustering_results.items():
        method_metrics = analyzer.evaluate_clustering(labels, method)
        evaluation_results[method] = method_metrics
        # Visualize
        analyzer.visualize_clusters(labels, method)
        analyzer.analyze_cluster_composition(labels, method)
    # 5. SAVE FINAL RESULTS
    log_message("\n" + "="*80)
    log_message("STEP 5: SAVE FINAL RESULTS")
    log_message("="*80)
    # Final CSV with all information
    output_df = df_with_features[[
        'image_name', 'fase_P', 'split',
        'cluster_kmeans', 'cluster_hierarchical'
    ]].copy()
    # Rename for clarity
    output_df = output_df.rename(columns={
        'image_name': 'image',
        'fase_P': 'phase_P'
    })
    output_csv_path = os.path.join(OUTPUT_DIR, 'clustering_results.csv')
    output_df.to_csv(output_csv_path, index=False)
    log_message(f"Results saved: {output_csv_path}")
    # Save summary JSON
    summary = {
        'execution_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_images': len(df_with_features),
        'num_clusters_optimal': int(optimal_n),
        'clustering_methods': list(clustering_results.keys()),
        'evaluation': {k: {kk: float(vv) for kk, vv in v.items()}
                       for k, v in evaluation_results.items()},
        'split_distribution': {
            'train': int((df_with_features['split'] == 'train').sum()),
            'val': int((df_with_features['split'] == 'val').sum()),
            'test': int((df_with_features['split'] == 'test').sum())
        }
    }
    with open(os.path.join(OUTPUT_DIR, 'summary_clustering.json'), 'w') as f:
        json.dump(summary, f, indent=2)
    # 6. FINAL SUMMARY
    print("\n" + "="*80)
    print("PIPELINE COMPLETED SUCCESSFULLY")
    print("="*80)
    print(f"\nResults saved in: {OUTPUT_DIR}")
    print("\nSummary:")
    print(f"  - Images processed: {len(df_with_features)}")
    print(f"  - Clusters found: {optimal_n}")
    print(f"  - Train: {summary['split_distribution']['train']}")
    print(f"  - Val: {summary['split_distribution']['val']}")
    print(f"  - Test: {summary['split_distribution']['test']}")
    print("\nFiles generated:")
    print("  - clustering_results.csv (Main CSV)")
    print("  - summary_clustering.json (Detailed summary)")
    print("  - features.npy (Extracted features)")
    print("  - clustering_metrics.png (Evaluation metrics)")
    print("  - clusters_visualization_*.png (Visualizations)")
    print("  - dendrogram.png (Hierarchical dendrogram)")
    print("  - cluster_heatmap_*.png (Cluster composition)")
    print("="*80 + "\n")


if __name__ == '__main__':
    main()