""" Clustering Non supervised for Hazelnut (Nocciola) Pipeline complete: 1. Feature extraction with ResNet50/EfficientNet 2. Dimensionality reduction (PCA + UMAP) 3. Hierarchical and optimized K-Means clustering 4. Advanced visualization and results analysis 5. Comparison with real labels (external validation) """ import os import random import warnings import json from pathlib import Path from datetime import datetime import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from tqdm import tqdm # Deep Learning import tensorflow as tf from tensorflow.keras.applications import ResNet50, EfficientNetB0 from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess from tensorflow.keras.applications.efficientnet import preprocess_input as efficient_preprocess from tensorflow.keras.preprocessing import image # Machine Learning from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score from sklearn.manifold import TSNE # UMAP for better visualization try: import umap UMAP_AVAILABLE = True except ImportError: print(" Not available UMAP. Install it with: pip install umap-learn") UMAP_AVAILABLE = False # Hierarchy clustering from scipy.cluster.hierarchy import dendrogram, linkage from scipy.spatial.distance import pdist warnings.filterwarnings('ignore') # ============================================================================= # CONFIGURACIÓN # ============================================================================= # Roots CSV_PATH = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\tags.csv' IMAGES_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow' OUTPUT_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\results_clustering_avanzado_8C' # Processing parameters IMG_SIZE = (224, 224) BATCH_SIZE = 32 SEED = 42 # Data splitting parameters SPLIT_RATIO = { 'train': 0.70, 'val': 0.15, 'test': 0.15 } # Clustering parameters N_CLUSTERS_RANGE = range(8, 15) # Test from 4 to 14 clusters PCA_COMPONENTS = 50 # PCA components for analysis UMAP_COMPONENTS = 2 # For visualization # Create output directory os.makedirs(OUTPUT_DIR, exist_ok=True) # ============================================================================= # UTILITIES # ============================================================================= def set_seed(seed=42): """Set seed for reproducibility""" random.seed(seed) np.random.seed(seed) tf.random.set_seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) print(f"Seed set: {seed}") def log_message(message, level='INFO'): """Logging with timestamp""" timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"[{timestamp}] [{level}] {message}") def safe_read_csv(path): """Read CSV with encoding handling""" if not os.path.exists(path): raise FileNotFoundError(f'CSV not found: {path}') for encoding in ['utf-8', 'latin-1', 'cp1252']: try: df = pd.read_csv(path, encoding=encoding) log_message(f"CSV read successfully with encoding: {encoding}") return df except UnicodeDecodeError: continue raise ValueError("Could not read CSV with any encoding") def find_image_path(images_dir, img_name): """Find full path of an image""" if pd.isna(img_name) or str(img_name).strip() == '': return None img_name = str(img_name).strip() for ext in ['.jpg', 
'.jpeg', '.png', '.JPG', '.JPEG', '.PNG']: # Without extension img_path = os.path.join(images_dir, os.path.splitext(img_name)[0] + ext) if os.path.exists(img_path): return img_path # With direct name img_path = os.path.join(images_dir, img_name) if os.path.exists(img_path): return img_path # Search recursively in subdirectories for root, dirs, files in os.walk(images_dir): for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']: img_path = os.path.join(root, os.path.splitext(img_name)[0] + ext) if os.path.exists(img_path): return img_path return None # ============================================================================= # LOAD AND PREPARE DATA # ============================================================================= class DatasetLoader: """Class to load and prepare the dataset""" def __init__(self, csv_path, images_dir): self.csv_path = csv_path self.images_dir = images_dir self.df = None self.df_clean = None def load_and_clean(self): """Load and clean data""" log_message("Loading dataset...") # Read CSV self.df = safe_read_csv(self.csv_path) log_message(f"Original dataset: {len(self.df)} rows") # Show available columns log_message(f"Available columns: {list(self.df.columns)}") # Check required columns required_cols = ['fase'] available_cols = [] for col in self.df.columns: if 'fase' in col.lower(): available_cols.append(('fase_P', col)) log_message(f"Phases columns detected: {available_cols}") # Identify image column img_col = None for col in ['id_img', 'imagen', 'image', 'filename', 'file']: if col in self.df.columns: img_col = col break if img_col is None: # Try with the first column that looks like file names for col in self.df.columns: if self.df[col].dtype == 'object': sample = str(self.df[col].iloc[0]) if any(ext in sample.lower() for ext in ['.jpg', '.png', '.jpeg']): img_col = col break if img_col is None: raise ValueError("Could not identify the image column") log_message(f"Image column: {img_col}") # Filter valid rows valid_rows = [] for idx, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Verifying images"): img_path = find_image_path(self.images_dir, row[img_col]) if img_path: valid_row = { 'image_name': os.path.basename(img_path), 'image_path': img_path, 'fase_P': row.get('fase', np.nan) } valid_rows.append(valid_row) self.df_clean = pd.DataFrame(valid_rows) log_message(f"Valid images found: {len(self.df_clean)}") # Distribution analysis self.analyze_distribution() return self.df_clean def analyze_distribution(self): """Analyze phase distribution""" log_message("\n=== Phase Distribution Analysis ===") if 'fase_P' in self.df_clean.columns: fase_p_counts = self.df_clean['fase_P'].value_counts() log_message(f"\nfase ({len(fase_p_counts)} classes):") for fase, count in fase_p_counts.items(): print(f" {fase}: {count} images") def split_data(self, split_ratio=SPLIT_RATIO): """Dividing data in train/val/test""" log_message("\n🔀 Dividing data...") # Shuffle data df_shuffled = self.df_clean.sample(frac=1, random_state=SEED).reset_index(drop=True) n = len(df_shuffled) n_train = int(n * split_ratio['train']) n_val = int(n * split_ratio['val']) train_df = df_shuffled.iloc[:n_train].copy() val_df = df_shuffled.iloc[n_train:n_train + n_val].copy() test_df = df_shuffled.iloc[n_train + n_val:].copy() train_df['split'] = 'train' val_df['split'] = 'val' test_df['split'] = 'test' log_message(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}") # Guardar splits split_df = pd.concat([train_df, val_df, test_df], ignore_index=True) 
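
# -----------------------------------------------------------------------------
# Optional: stratified splitting sketch (not used by the pipeline above).
# DatasetLoader.split_data() shuffles and splits the data purely at random; if
# the 'fase_P' classes are imbalanced, a stratified split keeps the class
# proportions similar across train/val/test. This is a minimal sketch assuming
# every phase class has enough samples to stratify on; `stratified_split_sketch`
# is a hypothetical helper, not part of the original script.
# -----------------------------------------------------------------------------
def stratified_split_sketch(df_clean, split_ratio=SPLIT_RATIO, seed=SEED):
    """Return train/val/test DataFrames stratified by 'fase_P' (sketch only)."""
    from sklearn.model_selection import train_test_split

    # Split off the training portion, stratifying on the phase labels.
    train_df, rest_df = train_test_split(
        df_clean,
        train_size=split_ratio['train'],
        stratify=df_clean['fase_P'],
        random_state=seed
    )
    # Split the remainder into validation and test (0.15 / 0.15 of the total).
    val_df, test_df = train_test_split(
        rest_df,
        test_size=split_ratio['test'] / (split_ratio['val'] + split_ratio['test']),
        stratify=rest_df['fase_P'],
        random_state=seed
    )
    return train_df, val_df, test_df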
# =============================================================================
# FEATURE EXTRACTION
# =============================================================================

class FeatureExtractor:
    """Feature extractor using pre-trained CNNs."""

    def __init__(self, model_name='resnet50'):
        self.model_name = model_name
        self.model = None
        self.preprocess_fn = None
        self._build_model()

    def _build_model(self):
        """Build the feature extractor model."""
        log_message(f"Building feature extractor: {self.model_name}")

        if self.model_name.lower() == 'resnet50':
            self.model = ResNet50(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = resnet_preprocess
        elif self.model_name.lower() == 'efficientnet':
            self.model = EfficientNetB0(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = efficient_preprocess
        else:
            raise ValueError(f"Unsupported model: {self.model_name}")

        log_message(f"Model built. Output shape: {self.model.output_shape}")

    def load_and_preprocess_image(self, img_path):
        """Load and preprocess an image."""
        try:
            img = image.load_img(img_path, target_size=IMG_SIZE)
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = self.preprocess_fn(img_array)
            return img_array
        except Exception as e:
            log_message(f"Error loading image {img_path}: {e}", level='ERROR')
            return None

    def extract_features(self, df):
        """Extract features from all images."""
        log_message(f"\nExtracting features from {len(df)} images...")

        features_list = []
        valid_indices = []

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
            img_array = self.load_and_preprocess_image(row['image_path'])

            if img_array is not None:
                features = self.model.predict(img_array, verbose=0)
                features_list.append(features.flatten())
                valid_indices.append(idx)

        features_array = np.array(features_list)
        log_message(f"Features extracted: {features_array.shape}")

        return features_array, valid_indices
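
# -----------------------------------------------------------------------------
# Optional: batched feature extraction sketch (not used by the pipeline above).
# extract_features() calls model.predict() one image at a time, so the
# BATCH_SIZE constant defined in the configuration is never used. This is a
# minimal sketch, assuming the same FeatureExtractor instance, of how images
# could be predicted in batches to reduce per-call overhead;
# `extract_features_batched` is a hypothetical helper, not part of the
# original script.
# -----------------------------------------------------------------------------
def extract_features_batched(extractor, df, batch_size=BATCH_SIZE):
    """Extract features in batches of `batch_size` images (sketch only)."""
    features_list = []
    valid_indices = []
    batch_imgs, batch_idx = [], []

    def flush():
        # Predict the accumulated batch in a single forward pass.
        if batch_imgs:
            batch = np.vstack(batch_imgs)
            preds = extractor.model.predict(batch, verbose=0)
            features_list.extend(preds.reshape(len(batch_imgs), -1))
            valid_indices.extend(batch_idx)
            batch_imgs.clear()
            batch_idx.clear()

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features (batched)"):
        img_array = extractor.load_and_preprocess_image(row['image_path'])
        if img_array is not None:
            batch_imgs.append(img_array)
            batch_idx.append(idx)
            if len(batch_imgs) >= batch_size:
                flush()
    flush()  # Predict any remaining images.

    return np.array(features_list), valid_indices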
# =============================================================================
# CLUSTERING AND ANALYSIS
# =============================================================================

class ClusteringAnalyzer:
    """Advanced clustering analysis."""

    def __init__(self, features, df, output_dir):
        self.features = features
        self.df = df
        self.output_dir = output_dir
        self.scaler = StandardScaler()
        self.features_scaled = None
        self.pca = None
        self.features_pca = None
        self.umap_model = None
        self.features_umap = None

    def preprocess_features(self):
        """Preprocess features."""
        log_message("\nPreprocessing features...")

        # Scale
        self.features_scaled = self.scaler.fit_transform(self.features)
        log_message(f"Features scaled: {self.features_scaled.shape}")

        # PCA
        self.pca = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
        self.features_pca = self.pca.fit_transform(self.features_scaled)

        variance_explained = np.sum(self.pca.explained_variance_ratio_)
        log_message(f"PCA: {PCA_COMPONENTS} components explain {variance_explained:.2%} of variance")

        # UMAP (if available)
        if UMAP_AVAILABLE:
            log_message("Applying UMAP for visualization...")
            self.umap_model = umap.UMAP(
                n_components=UMAP_COMPONENTS,
                n_neighbors=15,
                min_dist=0.1,
                metric='euclidean',
                random_state=SEED
            )
            self.features_umap = self.umap_model.fit_transform(self.features_pca)
            log_message(f"UMAP completed: {self.features_umap.shape}")
        else:
            # Use t-SNE as an alternative
            log_message("Applying t-SNE for visualization...")
            tsne = TSNE(n_components=2, random_state=SEED, perplexity=30)
            self.features_umap = tsne.fit_transform(self.features_pca)
            log_message(f"t-SNE completed: {self.features_umap.shape}")

    def find_optimal_clusters(self):
        """Find the optimal number of clusters."""
        log_message("\nSearching for optimal number of clusters...")

        metrics = {
            'n_clusters': [],
            'silhouette': [],
            'calinski_harabasz': [],
            'davies_bouldin': []
        }

        for n in tqdm(N_CLUSTERS_RANGE, desc="Evaluating clusters"):
            kmeans = KMeans(n_clusters=n, random_state=SEED, n_init=10)
            labels = kmeans.fit_predict(self.features_pca)

            metrics['n_clusters'].append(n)
            metrics['silhouette'].append(silhouette_score(self.features_pca, labels))
            metrics['calinski_harabasz'].append(calinski_harabasz_score(self.features_pca, labels))
            metrics['davies_bouldin'].append(davies_bouldin_score(self.features_pca, labels))

        # Visualize metrics
        self.plot_clustering_metrics(metrics)

        # Find the optimum (silhouette score)
        best_idx = np.argmax(metrics['silhouette'])
        best_n = metrics['n_clusters'][best_idx]

        log_message(f"Optimal number of clusters (Silhouette): {best_n}")

        return best_n, metrics

    def perform_clustering(self, n_clusters):
        """Perform clustering with different algorithms."""
        log_message(f"\nApplying clustering with {n_clusters} clusters...")

        results = {}

        # K-Means
        log_message("Applying K-Means...")
        kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=20)
        results['kmeans'] = kmeans.fit_predict(self.features_pca)

        # Hierarchical clustering
        log_message("Applying Hierarchical Clustering...")
        hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
        results['hierarchical'] = hierarchical.fit_predict(self.features_pca)

        # Save results
        for method, labels in results.items():
            self.df[f'cluster_{method}'] = labels

        log_message(f"Clustering completed: {len(results)} methods")

        return results

    def evaluate_clustering(self, labels, method_name):
        """Evaluate clustering quality."""
        silhouette = silhouette_score(self.features_pca, labels)
        calinski = calinski_harabasz_score(self.features_pca, labels)
        davies = davies_bouldin_score(self.features_pca, labels)

        log_message(f"\nMetrics for {method_name}:")
        log_message(f"  Silhouette Score: {silhouette:.4f}")
        log_message(f"  Calinski-Harabasz: {calinski:.4f}")
        log_message(f"  Davies-Bouldin: {davies:.4f}")

        # Compare with true labels if they exist
        if 'fase_P' in self.df.columns:
            # Filter NaN
            valid_mask = ~self.df['fase_P'].isna()
            if valid_mask.sum() > 0:
                true_labels = pd.Categorical(self.df.loc[valid_mask, 'fase_P']).codes
                pred_labels = labels[valid_mask]

                ari = adjusted_rand_score(true_labels, pred_labels)
                nmi = normalized_mutual_info_score(true_labels, pred_labels)

                log_message("\nExternal validation (vs fase):")
                log_message(f"  Adjusted Rand Index: {ari:.4f}")
                log_message(f"  Normalized Mutual Info: {nmi:.4f}")

        return {
            'silhouette': silhouette,
            'calinski_harabasz': calinski,
            'davies_bouldin': davies
        }

    def plot_clustering_metrics(self, metrics):
        """Visualize clustering metrics."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Clustering Evaluation Metrics', fontsize=16, fontweight='bold')

        # Silhouette Score
        axes[0, 0].plot(metrics['n_clusters'], metrics['silhouette'], 'b-o', linewidth=2)
        axes[0, 0].set_xlabel('Number of Clusters')
        axes[0, 0].set_ylabel('Silhouette Score')
        axes[0, 0].set_title('Silhouette Score (higher is better)')
        axes[0, 0].grid(True, alpha=0.3)

        # Calinski-Harabasz
        axes[0, 1].plot(metrics['n_clusters'], metrics['calinski_harabasz'], 'g-o', linewidth=2)
        axes[0, 1].set_xlabel('Number of Clusters')
        axes[0, 1].set_ylabel('Calinski-Harabasz Score')
        axes[0, 1].set_title('Calinski-Harabasz Score (higher is better)')
        axes[0, 1].grid(True, alpha=0.3)

        # Davies-Bouldin
        axes[1, 0].plot(metrics['n_clusters'], metrics['davies_bouldin'], 'r-o', linewidth=2)
        axes[1, 0].set_xlabel('Number of Clusters')
        axes[1, 0].set_ylabel('Davies-Bouldin Score')
        axes[1, 0].set_title('Davies-Bouldin Score (lower is better)')
        axes[1, 0].grid(True, alpha=0.3)

        # Summary
        axes[1, 1].axis('off')
        best_silhouette = metrics['n_clusters'][np.argmax(metrics['silhouette'])]
        best_calinski = metrics['n_clusters'][np.argmax(metrics['calinski_harabasz'])]
        best_davies = metrics['n_clusters'][np.argmin(metrics['davies_bouldin'])]

        summary_text = f"""
        Optimal Number of Clusters:

        Silhouette Score: {best_silhouette}
        Calinski-Harabasz: {best_calinski}
        Davies-Bouldin: {best_davies}

        Recommendation: {best_silhouette} clusters
        (based on Silhouette Score)
        """

        axes[1, 1].text(0.1, 0.5, summary_text, fontsize=12,
                        verticalalignment='center',
                        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'clustering_metrics.png'),
                    dpi=300, bbox_inches='tight')
        log_message("Saved: clustering_metrics.png")
        plt.close()

    def visualize_clusters(self, labels, method_name, title_suffix=""):
        """Visualize clusters in 2D space."""
        if self.features_umap is None:
            log_message("⚠️ No reduced features available for visualization", level='WARNING')
            return

        fig, axes = plt.subplots(1, 2, figsize=(20, 8))
        fig.suptitle(f'Clusters Visualization - {method_name} {title_suffix}',
                     fontsize=16, fontweight='bold')

        # Plot 1: clusters found
        scatter = axes[0].scatter(
            self.features_umap[:, 0],
            self.features_umap[:, 1],
            c=labels,
            cmap='tab20',
            s=50,
            alpha=0.6,
            edgecolors='black',
            linewidth=0.5
        )
        axes[0].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
        axes[0].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
        axes[0].set_title('Clusters Found', fontsize=14)
        axes[0].grid(True, alpha=0.3)
        plt.colorbar(scatter, ax=axes[0], label='Cluster ID')

        # Plot 2: real phases (if available)
        if 'fase_P' in self.df.columns:
            fase_p_codes = pd.Categorical(self.df['fase_P']).codes
            scatter2 = axes[1].scatter(
                self.features_umap[:, 0],
                self.features_umap[:, 1],
                c=fase_p_codes,
                cmap='viridis',
                s=50,
                alpha=0.6,
                edgecolors='black',
                linewidth=0.5
            )
            axes[1].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
            axes[1].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
            axes[1].set_title('Real Phases (fase_P)', fontsize=14)
            axes[1].grid(True, alpha=0.3)
            plt.colorbar(scatter2, ax=axes[1], label='Phase code')
        else:
            axes[1].axis('off')
            axes[1].text(0.5, 0.5, 'No real labels available',
                         ha='center', va='center', fontsize=14)

        plt.tight_layout()
        filename = f'clusters_visualization_{method_name}.png'
        plt.savefig(os.path.join(self.output_dir, filename), dpi=300, bbox_inches='tight')
        log_message(f"Saved visualization: {filename}")
        plt.close()

    def plot_dendrogram(self):
        """Plot the hierarchical dendrogram."""
        log_message("\nGenerating dendrogram...")

        # Calculate linkage
        linkage_matrix = linkage(self.features_pca, method='ward')

        plt.figure(figsize=(20, 10))
        dendrogram(
            linkage_matrix,
            truncate_mode='lastp',
            p=30,
            leaf_rotation=90,
            leaf_font_size=10,
            show_contracted=True
        )
        plt.title('Hierarchical Clustering Dendrogram', fontsize=16, fontweight='bold')
        plt.xlabel('Sample Index', fontsize=12)
        plt.ylabel('Distance', fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'dendrogram.png'),
                    dpi=300, bbox_inches='tight')
        log_message("Dendrogram saved: dendrogram.png")
        plt.close()

    def analyze_cluster_composition(self, labels, method_name):
        """Analyze cluster composition against the real phases."""
        if 'fase_P' not in self.df.columns:
            return

        log_message(f"\nAnalyzing cluster composition ({method_name})...")

        # Create contingency table
        contingency = pd.crosstab(
            labels,
            self.df['fase_P'],
            margins=True
        )

        # Save table
        contingency.to_csv(
            os.path.join(self.output_dir, f'cluster_composition_{method_name}.csv')
        )

        # Plot heatmap
        plt.figure(figsize=(14, 10))
        sns.heatmap(
            contingency.iloc[:-1, :-1],  # Drop the margin totals
            annot=True,
            fmt='d',
            cmap='YlOrRd',
            cbar_kws={'label': 'Number of Images'}
        )
        plt.title(f'Cluster Composition vs Real Phase - {method_name}',
                  fontsize=14, fontweight='bold')
        plt.xlabel('Real Phase (fase_P)', fontsize=12)
        plt.ylabel('Cluster ID', fontsize=12)
        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, f'cluster_heatmap_{method_name}.png'),
            dpi=300, bbox_inches='tight'
        )
        log_message(f"Heatmap saved: cluster_heatmap_{method_name}.png")
        plt.close()

# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Main clustering pipeline."""
    print("\n" + "="*80)
    print("🎓 ADVANCED UNSUPERVISED CLUSTERING - NOCCIOLA")
    print("="*80 + "\n")

    # Set seed
    set_seed(SEED)

    # 1. LOAD DATA
    log_message("="*80)
    log_message("STEP 1: DATA LOADING AND PREPARATION")
    log_message("="*80)

    loader = DatasetLoader(CSV_PATH, IMAGES_DIR)
    df_clean = loader.load_and_clean()

    if len(df_clean) == 0:
        log_message("No valid data found", level='ERROR')
        return

    train_df, val_df, test_df, full_df = loader.split_data()

    # 2. EXTRACT FEATURES
    log_message("\n" + "="*80)
    log_message("STEP 2: FEATURE EXTRACTION")
    log_message("="*80)

    extractor = FeatureExtractor(model_name='resnet50')

    # Extract features for all data
    features, valid_indices = extractor.extract_features(full_df)

    # Keep only the rows whose features were extracted
    df_with_features = full_df.iloc[valid_indices].reset_index(drop=True)

    # Save features
    np.save(os.path.join(OUTPUT_DIR, 'features.npy'), features)
    df_with_features.to_csv(os.path.join(OUTPUT_DIR, 'data_with_features.csv'), index=False)

    # 3. CLUSTERING AND ANALYSIS
    log_message("\n" + "="*80)
    log_message("STEP 3: CLUSTERING AND ANALYSIS")
    log_message("="*80)

    analyzer = ClusteringAnalyzer(features, df_with_features, OUTPUT_DIR)
    analyzer.preprocess_features()

    # Find the optimal number of clusters
    optimal_n, metrics = analyzer.find_optimal_clusters()

    # Cluster with the optimal number
    clustering_results = analyzer.perform_clustering(optimal_n)

    # 4. EVALUATION AND VISUALIZATION
    log_message("\n" + "="*80)
    log_message("STEP 4: EVALUATION AND VISUALIZATION")
    log_message("="*80)

    # Dendrogram
    analyzer.plot_dendrogram()

    # Evaluate each method
    evaluation_results = {}
    for method, labels in clustering_results.items():
        metrics = analyzer.evaluate_clustering(labels, method)
        evaluation_results[method] = metrics

        # Visualize
        analyzer.visualize_clusters(labels, method)
        analyzer.analyze_cluster_composition(labels, method)

    # 5. SAVE FINAL RESULTS
    log_message("\n" + "="*80)
    log_message("STEP 5: SAVE FINAL RESULTS")
    log_message("="*80)

    # Final CSV with all information
    output_df = df_with_features[[
        'image_name', 'fase_P', 'split',
        'cluster_kmeans', 'cluster_hierarchical'
    ]].copy()

    # Rename for clarity (cluster columns keep their names)
    output_df = output_df.rename(columns={
        'image_name': 'image',
        'fase_P': 'phase_P'
    })

    output_csv_path = os.path.join(OUTPUT_DIR, 'clustering_results.csv')
    output_df.to_csv(output_csv_path, index=False)
    log_message(f"Results saved: {output_csv_path}")

    # Save summary JSON
    summary = {
        'execution_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_images': len(df_with_features),
        'num_clusters_optimal': int(optimal_n),
        'clustering_methods': list(clustering_results.keys()),
        'evaluation': {k: {kk: float(vv) for kk, vv in v.items()}
                       for k, v in evaluation_results.items()},
        'split_distribution': {
            'train': int((df_with_features['split'] == 'train').sum()),
            'val': int((df_with_features['split'] == 'val').sum()),
            'test': int((df_with_features['split'] == 'test').sum())
        }
    }

    with open(os.path.join(OUTPUT_DIR, 'summary_clustering.json'), 'w') as f:
        json.dump(summary, f, indent=2)

    # 6. FINAL SUMMARY
    print("\n" + "="*80)
    print("PIPELINE COMPLETED SUCCESSFULLY")
    print("="*80)
    print(f"\nResults saved in: {OUTPUT_DIR}")
    print("\nSummary:")
    print(f"  - Images processed: {len(df_with_features)}")
    print(f"  - Clusters found: {optimal_n}")
    print(f"  - Train: {summary['split_distribution']['train']}")
    print(f"  - Val: {summary['split_distribution']['val']}")
    print(f"  - Test: {summary['split_distribution']['test']}")
    print("\nFiles generated:")
    print("  - clustering_results.csv (main CSV)")
    print("  - summary_clustering.json (detailed summary)")
    print("  - features.npy (extracted features)")
    print("  - clustering_metrics.png (evaluation metrics)")
    print("  - clusters_visualization_*.png (visualizations)")
    print("  - dendrogram.png (hierarchical dendrogram)")
    print("  - cluster_heatmap_*.png (cluster composition)")
    print("="*80 + "\n")


if __name__ == '__main__':
    main()
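
# -----------------------------------------------------------------------------
# Optional: DBSCAN sketch (defined only; never executed by the pipeline).
# DBSCAN is imported at the top of the script but not wired into
# perform_clustering(). This is a minimal sketch of how it could be applied to
# the PCA-reduced features; the eps/min_samples values are placeholder
# assumptions that would need tuning (e.g. via a k-distance plot), and DBSCAN
# marks noise points with the label -1.
# -----------------------------------------------------------------------------
def dbscan_clustering_sketch(features_pca, eps=5.0, min_samples=10):
    """Run DBSCAN on reduced features and report cluster/noise counts (sketch only)."""
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(features_pca)

    # Count clusters excluding the noise label (-1).
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = int(np.sum(labels == -1))
    log_message(f"DBSCAN found {n_clusters} clusters and {n_noise} noise points")
    return labels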