"""
|
|
Clustering Non supervised for Hazelnut (Nocciola)
|
|
|
|
Pipeline complete:
|
|
1. Feature extraction with ResNet50/EfficientNet
|
|
2. Dimensionality reduction (PCA + UMAP)
|
|
3. Hierarchical and optimized K-Means clustering
|
|
4. Advanced visualization and results analysis
|
|
5. Comparison with real labels (external validation)
|
|
"""

import os
import random
import warnings
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Deep Learning
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, EfficientNetB0
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.efficientnet import preprocess_input as efficient_preprocess
from tensorflow.keras.preprocessing import image

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.manifold import TSNE

# UMAP for better visualization
try:
    import umap
    UMAP_AVAILABLE = True
except ImportError:
    print("UMAP not available. Install it with: pip install umap-learn")
    UMAP_AVAILABLE = False

# Hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
CSV_PATH = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\tags.csv'
IMAGES_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow'
OUTPUT_DIR = r'C:\Users\sof12\Desktop\ML\Datasets\Carciofo\RoboFlow\results_clustering_avanzado_8C'

# Processing parameters
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42

# Data splitting parameters
SPLIT_RATIO = {
    'train': 0.70,
    'val': 0.15,
    'test': 0.15
}
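
# Added sanity check (not part of the original pipeline): the split ratios are
# assumed to sum to 1.0; failing fast here is cheaper than debugging an
# unexpectedly small test split later.
assert abs(sum(SPLIT_RATIO.values()) - 1.0) < 1e-9, "SPLIT_RATIO values must sum to 1.0"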

# Clustering parameters
N_CLUSTERS_RANGE = range(8, 15)  # Test from 8 to 14 clusters
PCA_COMPONENTS = 50              # PCA components for analysis
UMAP_COMPONENTS = 2              # For visualization

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =============================================================================
# UTILITIES
# =============================================================================

def set_seed(seed=42):
    """Set seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"Seed set: {seed}")


def log_message(message, level='INFO'):
    """Logging with timestamp"""
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] [{level}] {message}")


def safe_read_csv(path):
    """Read a CSV, trying several encodings"""
    if not os.path.exists(path):
        raise FileNotFoundError(f'CSV not found: {path}')

    for encoding in ['utf-8', 'latin-1', 'cp1252']:
        try:
            df = pd.read_csv(path, encoding=encoding)
            log_message(f"CSV read successfully with encoding: {encoding}")
            return df
        except UnicodeDecodeError:
            continue

    raise ValueError("Could not read CSV with any of the attempted encodings")


def find_image_path(images_dir, img_name):
    """Find the full path of an image"""
    if pd.isna(img_name) or str(img_name).strip() == '':
        return None

    img_name = str(img_name).strip()

    # Try the base name with each known extension
    for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
        img_path = os.path.join(images_dir, os.path.splitext(img_name)[0] + ext)
        if os.path.exists(img_path):
            return img_path

    # Try the name exactly as given
    img_path = os.path.join(images_dir, img_name)
    if os.path.exists(img_path):
        return img_path

    # Search recursively in subdirectories
    for root, dirs, files in os.walk(images_dir):
        for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
            img_path = os.path.join(root, os.path.splitext(img_name)[0] + ext)
            if os.path.exists(img_path):
                return img_path

    return None

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================

class DatasetLoader:
    """Load and prepare the dataset"""

    def __init__(self, csv_path, images_dir):
        self.csv_path = csv_path
        self.images_dir = images_dir
        self.df = None
        self.df_clean = None

    def load_and_clean(self):
        """Load and clean the data"""
        log_message("Loading dataset...")

        # Read CSV
        self.df = safe_read_csv(self.csv_path)
        log_message(f"Original dataset: {len(self.df)} rows")

        # Show available columns
        log_message(f"Available columns: {list(self.df.columns)}")

        # Detect phase columns (any column whose name contains 'fase')
        available_cols = []
        for col in self.df.columns:
            if 'fase' in col.lower():
                available_cols.append(('fase_P', col))

        log_message(f"Phase columns detected: {available_cols}")

        # Identify the image column
        img_col = None
        for col in ['id_img', 'imagen', 'image', 'filename', 'file']:
            if col in self.df.columns:
                img_col = col
                break

        if img_col is None:
            # Fall back to the first text column that looks like a file name
            for col in self.df.columns:
                if self.df[col].dtype == 'object':
                    sample = str(self.df[col].iloc[0])
                    if any(ext in sample.lower() for ext in ['.jpg', '.png', '.jpeg']):
                        img_col = col
                        break

        if img_col is None:
            raise ValueError("Could not identify the image column")

        log_message(f"Image column: {img_col}")

        # Keep only rows whose image file can be found; the phase label is read
        # from the 'fase' column (assumed to exist in the CSV)
        valid_rows = []
        for idx, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Verifying images"):
            img_path = find_image_path(self.images_dir, row[img_col])
            if img_path:
                valid_rows.append({
                    'image_name': os.path.basename(img_path),
                    'image_path': img_path,
                    'fase_P': row.get('fase', np.nan)
                })

        self.df_clean = pd.DataFrame(valid_rows)
        log_message(f"Valid images found: {len(self.df_clean)}")

        # Distribution analysis
        self.analyze_distribution()

        return self.df_clean

    def analyze_distribution(self):
        """Analyze the phase distribution"""
        log_message("\n=== Phase Distribution Analysis ===")

        if 'fase_P' in self.df_clean.columns:
            fase_p_counts = self.df_clean['fase_P'].value_counts()
            log_message(f"\nfase ({len(fase_p_counts)} classes):")
            for fase, count in fase_p_counts.items():
                print(f"  {fase}: {count} images")

    def split_data(self, split_ratio=SPLIT_RATIO):
        """Split the data into train/val/test"""
        log_message("\n🔀 Splitting data...")

        # Shuffle data
        df_shuffled = self.df_clean.sample(frac=1, random_state=SEED).reset_index(drop=True)

        n = len(df_shuffled)
        n_train = int(n * split_ratio['train'])
        n_val = int(n * split_ratio['val'])

        train_df = df_shuffled.iloc[:n_train].copy()
        val_df = df_shuffled.iloc[n_train:n_train + n_val].copy()
        test_df = df_shuffled.iloc[n_train + n_val:].copy()

        train_df['split'] = 'train'
        val_df['split'] = 'val'
        test_df['split'] = 'test'

        log_message(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

        # Save splits
        split_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
        split_df.to_csv(os.path.join(OUTPUT_DIR, 'data_splits.csv'), index=False)

        return train_df, val_df, test_df, split_df
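
    # Note (added): split_data shuffles and slices without stratifying by
    # 'fase_P', so rare phases can end up unevenly distributed across
    # train/val/test. If stratification matters, sklearn's
    # model_selection.train_test_split(..., stratify=...) is one option.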

# =============================================================================
# FEATURE EXTRACTION
# =============================================================================

class FeatureExtractor:
    """Feature extractor using pre-trained CNNs"""

    def __init__(self, model_name='resnet50'):
        self.model_name = model_name
        self.model = None
        self.preprocess_fn = None
        self._build_model()

    def _build_model(self):
        """Build the feature extractor model"""
        log_message(f"Building feature extractor: {self.model_name}")

        if self.model_name.lower() == 'resnet50':
            self.model = ResNet50(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = resnet_preprocess
        elif self.model_name.lower() == 'efficientnet':
            self.model = EfficientNetB0(
                weights='imagenet',
                include_top=False,
                pooling='avg',
                input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
            )
            self.preprocess_fn = efficient_preprocess
        else:
            raise ValueError(f"Unsupported model: {self.model_name}")

        log_message(f"Model built. Output shape: {self.model.output_shape}")

    def load_and_preprocess_image(self, img_path):
        """Load and preprocess a single image"""
        try:
            img = image.load_img(img_path, target_size=IMG_SIZE)
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = self.preprocess_fn(img_array)
            return img_array
        except Exception as e:
            log_message(f"Error loading image {img_path}: {e}", level='ERROR')
            return None

    def extract_features(self, df):
        """Extract features from all images"""
        log_message(f"\nExtracting features from {len(df)} images...")

        features_list = []
        valid_indices = []

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
            img_array = self.load_and_preprocess_image(row['image_path'])

            if img_array is not None:
                features = self.model.predict(img_array, verbose=0)
                features_list.append(features.flatten())
                valid_indices.append(idx)

        features_array = np.array(features_list)
        log_message(f"Features extracted: {features_array.shape}")

        return features_array, valid_indices
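
    # Sketch (added, not called by the pipeline): a batched variant of
    # extract_features. model.predict runs once per batch of images instead of
    # once per image, which is usually much faster on GPU. It assumes the same
    # dataframe layout (an 'image_path' column) as extract_features.
    def extract_features_batched(self, df, batch_size=BATCH_SIZE):
        """Batched feature extraction (illustrative sketch)"""
        features_list = []
        valid_indices = []
        batch_arrays, batch_indices = [], []

        def _flush():
            # Run one forward pass over the accumulated batch
            if not batch_arrays:
                return
            batch = np.vstack(batch_arrays)
            preds = self.model.predict(batch, verbose=0)
            features_list.extend(preds.reshape(len(batch_indices), -1))
            valid_indices.extend(batch_indices)
            batch_arrays.clear()
            batch_indices.clear()

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features (batched)"):
            img_array = self.load_and_preprocess_image(row['image_path'])
            if img_array is not None:
                batch_arrays.append(img_array)
                batch_indices.append(idx)
                if len(batch_arrays) >= batch_size:
                    _flush()
        _flush()

        return np.array(features_list), valid_indices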

# =============================================================================
# CLUSTERING AND ANALYSIS
# =============================================================================

class ClusteringAnalyzer:
    """Advanced clustering analysis"""

    def __init__(self, features, df, output_dir):
        self.features = features
        self.df = df
        self.output_dir = output_dir
        self.scaler = StandardScaler()
        self.features_scaled = None
        self.pca = None
        self.features_pca = None
        self.umap_model = None
        self.features_umap = None

    def preprocess_features(self):
        """Preprocess features"""
        log_message("\nPreprocessing features...")

        # Scale
        self.features_scaled = self.scaler.fit_transform(self.features)
        log_message(f"Features scaled: {self.features_scaled.shape}")

        # PCA
        self.pca = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
        self.features_pca = self.pca.fit_transform(self.features_scaled)

        variance_explained = np.sum(self.pca.explained_variance_ratio_)
        log_message(f"PCA: {PCA_COMPONENTS} components explain {variance_explained:.2%} of variance")

        # UMAP (if available)
        if UMAP_AVAILABLE:
            log_message("Applying UMAP for visualization...")
            self.umap_model = umap.UMAP(
                n_components=UMAP_COMPONENTS,
                n_neighbors=15,
                min_dist=0.1,
                metric='euclidean',
                random_state=SEED
            )
            self.features_umap = self.umap_model.fit_transform(self.features_pca)
            log_message(f"UMAP completed: {self.features_umap.shape}")
        else:
            # Use t-SNE as an alternative
            log_message("Applying t-SNE for visualization...")
            tsne = TSNE(n_components=2, random_state=SEED, perplexity=30)
            self.features_umap = tsne.fit_transform(self.features_pca)
            log_message(f"t-SNE completed: {self.features_umap.shape}")

    def find_optimal_clusters(self):
        """Find the optimal number of clusters"""
        log_message("\nSearching for the optimal number of clusters...")

        metrics = {
            'n_clusters': [],
            'silhouette': [],
            'calinski_harabasz': [],
            'davies_bouldin': []
        }

        for n in tqdm(N_CLUSTERS_RANGE, desc="Evaluating clusters"):
            kmeans = KMeans(n_clusters=n, random_state=SEED, n_init=10)
            labels = kmeans.fit_predict(self.features_pca)

            metrics['n_clusters'].append(n)
            metrics['silhouette'].append(silhouette_score(self.features_pca, labels))
            metrics['calinski_harabasz'].append(calinski_harabasz_score(self.features_pca, labels))
            metrics['davies_bouldin'].append(davies_bouldin_score(self.features_pca, labels))

        # Visualize metrics
        self.plot_clustering_metrics(metrics)

        # Pick the best number of clusters by silhouette score
        best_idx = np.argmax(metrics['silhouette'])
        best_n = metrics['n_clusters'][best_idx]

        log_message(f"Optimal number of clusters (Silhouette): {best_n}")

        return best_n, metrics
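
    # Note (added): the silhouette / Calinski-Harabasz / Davies-Bouldin scores
    # above are computed in the PCA-reduced space, so the "optimal" cluster
    # count also depends on PCA_COMPONENTS and on N_CLUSTERS_RANGE (here 8-14).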

    def perform_clustering(self, n_clusters):
        """Perform clustering with different algorithms"""
        log_message(f"\nApplying clustering with {n_clusters} clusters...")

        results = {}

        # K-Means
        log_message("Applying K-Means...")
        kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=20)
        results['kmeans'] = kmeans.fit_predict(self.features_pca)

        # Hierarchical Clustering
        log_message("Applying Hierarchical Clustering...")
        hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
        results['hierarchical'] = hierarchical.fit_predict(self.features_pca)

        # Save results
        for method, labels in results.items():
            self.df[f'cluster_{method}'] = labels

        log_message(f"Clustering completed: {len(results)} methods")

        return results

    def evaluate_clustering(self, labels, method_name):
        """Evaluate clustering quality"""
        silhouette = silhouette_score(self.features_pca, labels)
        calinski = calinski_harabasz_score(self.features_pca, labels)
        davies = davies_bouldin_score(self.features_pca, labels)

        log_message(f"\nMetrics for {method_name}:")
        log_message(f"  Silhouette Score: {silhouette:.4f}")
        log_message(f"  Calinski-Harabasz: {calinski:.4f}")
        log_message(f"  Davies-Bouldin: {davies:.4f}")

        # Compare with the true labels if they exist
        if 'fase_P' in self.df.columns:
            # Filter NaN
            valid_mask = ~self.df['fase_P'].isna()
            if valid_mask.sum() > 0:
                true_labels = pd.Categorical(self.df.loc[valid_mask, 'fase_P']).codes
                pred_labels = labels[valid_mask]

                ari = adjusted_rand_score(true_labels, pred_labels)
                nmi = normalized_mutual_info_score(true_labels, pred_labels)

                log_message("\nExternal Validation (vs fase):")
                log_message(f"  Adjusted Rand Index: {ari:.4f}")
                log_message(f"  Normalized Mutual Info: {nmi:.4f}")

        return {
            'silhouette': silhouette,
            'calinski_harabasz': calinski,
            'davies_bouldin': davies
        }
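
    # Note (added): the Adjusted Rand Index and Normalized Mutual Information
    # above compare the unsupervised clusters against the 'fase_P' labels;
    # values near 0 mean the clusters are essentially unrelated to the phases,
    # values near 1 mean they recover the phase structure almost exactly.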

    def plot_clustering_metrics(self, metrics):
        """Visualize clustering metrics"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Clustering Evaluation Metrics', fontsize=16, fontweight='bold')

        # Silhouette Score
        axes[0, 0].plot(metrics['n_clusters'], metrics['silhouette'], 'b-o', linewidth=2)
        axes[0, 0].set_xlabel('Number of Clusters')
        axes[0, 0].set_ylabel('Silhouette Score')
        axes[0, 0].set_title('Silhouette Score (higher is better)')
        axes[0, 0].grid(True, alpha=0.3)

        # Calinski-Harabasz
        axes[0, 1].plot(metrics['n_clusters'], metrics['calinski_harabasz'], 'g-o', linewidth=2)
        axes[0, 1].set_xlabel('Number of Clusters')
        axes[0, 1].set_ylabel('Calinski-Harabasz Score')
        axes[0, 1].set_title('Calinski-Harabasz Score (higher is better)')
        axes[0, 1].grid(True, alpha=0.3)

        # Davies-Bouldin
        axes[1, 0].plot(metrics['n_clusters'], metrics['davies_bouldin'], 'r-o', linewidth=2)
        axes[1, 0].set_xlabel('Number of Clusters')
        axes[1, 0].set_ylabel('Davies-Bouldin Score')
        axes[1, 0].set_title('Davies-Bouldin Score (lower is better)')
        axes[1, 0].grid(True, alpha=0.3)

        # Summary
        axes[1, 1].axis('off')
        best_silhouette = metrics['n_clusters'][np.argmax(metrics['silhouette'])]
        best_calinski = metrics['n_clusters'][np.argmax(metrics['calinski_harabasz'])]
        best_davies = metrics['n_clusters'][np.argmin(metrics['davies_bouldin'])]

        summary_text = f"""
        Optimal Number of Clusters:

        Silhouette Score: {best_silhouette}
        Calinski-Harabasz: {best_calinski}
        Davies-Bouldin: {best_davies}

        Recommendation: {best_silhouette} clusters
        (based on Silhouette Score)
        """
        axes[1, 1].text(0.1, 0.5, summary_text, fontsize=12, verticalalignment='center',
                        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'clustering_metrics.png'), dpi=300, bbox_inches='tight')
        log_message("Saved: clustering_metrics.png")
        plt.close()

    def visualize_clusters(self, labels, method_name, title_suffix=""):
        """Visualize clusters in 2D space"""
        if self.features_umap is None:
            log_message("⚠️ No reduced features available for visualization", level='WARNING')
            return

        fig, axes = plt.subplots(1, 2, figsize=(20, 8))
        fig.suptitle(f'Clusters Visualization - {method_name} {title_suffix}',
                     fontsize=16, fontweight='bold')

        # Plot 1: Clusters
        scatter = axes[0].scatter(
            self.features_umap[:, 0],
            self.features_umap[:, 1],
            c=labels,
            cmap='tab20',
            s=50,
            alpha=0.6,
            edgecolors='black',
            linewidth=0.5
        )
        axes[0].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
        axes[0].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
        axes[0].set_title('Clusters Found', fontsize=14)
        axes[0].grid(True, alpha=0.3)
        plt.colorbar(scatter, ax=axes[0], label='Cluster ID')

        # Plot 2: Real phases (if available)
        if 'fase_P' in self.df.columns:
            fase_p_codes = pd.Categorical(self.df['fase_P']).codes
            scatter2 = axes[1].scatter(
                self.features_umap[:, 0],
                self.features_umap[:, 1],
                c=fase_p_codes,
                cmap='viridis',
                s=50,
                alpha=0.6,
                edgecolors='black',
                linewidth=0.5
            )
            axes[1].set_xlabel('UMAP/t-SNE Component 1', fontsize=12)
            axes[1].set_ylabel('UMAP/t-SNE Component 2', fontsize=12)
            axes[1].set_title('Real Phases (fase_P)', fontsize=14)
            axes[1].grid(True, alpha=0.3)
            plt.colorbar(scatter2, ax=axes[1], label='fase_P')
        else:
            axes[1].axis('off')
            axes[1].text(0.5, 0.5, 'No real labels available',
                         ha='center', va='center', fontsize=14)

        plt.tight_layout()
        filename = f'clusters_visualization_{method_name}.png'
        plt.savefig(os.path.join(self.output_dir, filename), dpi=300, bbox_inches='tight')
        log_message(f"Saved visualization: {filename}")
        plt.close()

    def plot_dendrogram(self):
        """Plot the hierarchical dendrogram"""
        log_message("\nGenerating dendrogram...")

        # Calculate linkage
        linkage_matrix = linkage(self.features_pca, method='ward')

        plt.figure(figsize=(20, 10))
        dendrogram(
            linkage_matrix,
            truncate_mode='lastp',
            p=30,
            leaf_rotation=90,
            leaf_font_size=10,
            show_contracted=True
        )
        plt.title('Hierarchical Clustering Dendrogram', fontsize=16, fontweight='bold')
        plt.xlabel('Sample Index', fontsize=12)
        plt.ylabel('Distance', fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'dendrogram.png'), dpi=300, bbox_inches='tight')
        log_message("Dendrogram saved: dendrogram.png")
        plt.close()

    def analyze_cluster_composition(self, labels, method_name):
        """Analyze cluster composition against the real phases"""
        if 'fase_P' not in self.df.columns:
            return

        log_message(f"\nAnalyzing cluster composition ({method_name})...")

        # Create contingency table
        contingency = pd.crosstab(
            labels,
            self.df['fase_P'],
            margins=True
        )

        # Save table
        contingency.to_csv(
            os.path.join(self.output_dir, f'cluster_composition_{method_name}.csv')
        )

        # Visualize heatmap
        plt.figure(figsize=(14, 10))
        sns.heatmap(
            contingency.iloc[:-1, :-1],  # Without margins
            annot=True,
            fmt='d',
            cmap='YlOrRd',
            cbar_kws={'label': 'Number of Images'}
        )
        plt.title(f'Cluster Composition vs fase_P - {method_name}',
                  fontsize=14, fontweight='bold')
        plt.xlabel('fase_P (real)', fontsize=12)
        plt.ylabel('Cluster ID', fontsize=12)
        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, f'cluster_heatmap_{method_name}.png'),
            dpi=300,
            bbox_inches='tight'
        )
        log_message(f"Heatmap saved: cluster_heatmap_{method_name}.png")
        plt.close()

# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Main clustering pipeline"""

    print("\n" + "="*80)
    print("🎓 ADVANCED UNSUPERVISED CLUSTERING - NOCCIOLA")
    print("="*80 + "\n")

    # Set seed
    set_seed(SEED)

    # 1. LOAD DATA
    log_message("="*80)
    log_message("STEP 1: DATA LOADING AND PREPARATION")
    log_message("="*80)

    loader = DatasetLoader(CSV_PATH, IMAGES_DIR)
    df_clean = loader.load_and_clean()

    if len(df_clean) == 0:
        log_message("No valid data found", level='ERROR')
        return

    train_df, val_df, test_df, full_df = loader.split_data()

    # 2. EXTRACT FEATURES
    log_message("\n" + "="*80)
    log_message("STEP 2: FEATURE EXTRACTION")
    log_message("="*80)

    extractor = FeatureExtractor(model_name='resnet50')

    # Extract features for all data
    features, valid_indices = extractor.extract_features(full_df)

    # Keep only the rows whose features were extracted successfully
    df_with_features = full_df.iloc[valid_indices].reset_index(drop=True)

    # Save features
    np.save(os.path.join(OUTPUT_DIR, 'features.npy'), features)
    df_with_features.to_csv(os.path.join(OUTPUT_DIR, 'data_with_features.csv'), index=False)
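
    # Note (added): features.npy and data_with_features.csv saved above could be
    # reloaded on later runs to skip the expensive CNN forward passes, provided
    # the dataset (and its ordering) has not changed; the pipeline below always
    # recomputes them from scratch.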

    # 3. CLUSTERING AND ANALYSIS
    log_message("\n" + "="*80)
    log_message("STEP 3: CLUSTERING AND ANALYSIS")
    log_message("="*80)

    analyzer = ClusteringAnalyzer(features, df_with_features, OUTPUT_DIR)
    analyzer.preprocess_features()

    # Find the optimal number of clusters
    optimal_n, metrics = analyzer.find_optimal_clusters()

    # Clustering with the optimal number
    clustering_results = analyzer.perform_clustering(optimal_n)

    # 4. EVALUATION AND VISUALIZATION
    log_message("\n" + "="*80)
    log_message("STEP 4: EVALUATION AND VISUALIZATION")
    log_message("="*80)

    # Dendrogram
    analyzer.plot_dendrogram()

    # Evaluate each method
    evaluation_results = {}
    for method, labels in clustering_results.items():
        metrics = analyzer.evaluate_clustering(labels, method)
        evaluation_results[method] = metrics

        # Visualize
        analyzer.visualize_clusters(labels, method)
        analyzer.analyze_cluster_composition(labels, method)

    # 5. SAVE FINAL RESULTS
    log_message("\n" + "="*80)
    log_message("STEP 5: SAVE FINAL RESULTS")
    log_message("="*80)

    # Final CSV with all the information
    output_df = df_with_features[[
        'image_name', 'fase_P', 'split',
        'cluster_kmeans', 'cluster_hierarchical'
    ]].copy()

    # Rename for clarity
    output_df = output_df.rename(columns={
        'image_name': 'image',
        'fase_P': 'phase_P'
    })

    output_csv_path = os.path.join(OUTPUT_DIR, 'clustering_results.csv')
    output_df.to_csv(output_csv_path, index=False)
    log_message(f"Results saved: {output_csv_path}")

    # Save summary JSON
    summary = {
        'execution_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_images': len(df_with_features),
        'num_clusters_optimal': int(optimal_n),
        'clustering_methods': list(clustering_results.keys()),
        'evaluation': {k: {kk: float(vv) for kk, vv in v.items()}
                       for k, v in evaluation_results.items()},
        'split_distribution': {
            'train': int((df_with_features['split'] == 'train').sum()),
            'val': int((df_with_features['split'] == 'val').sum()),
            'test': int((df_with_features['split'] == 'test').sum())
        }
    }

    with open(os.path.join(OUTPUT_DIR, 'summary_clustering.json'), 'w') as f:
        json.dump(summary, f, indent=2)

    # 6. FINAL SUMMARY
    print("\n" + "="*80)
    print("PIPELINE COMPLETED SUCCESSFULLY")
    print("="*80)
    print(f"\nResults saved in: {OUTPUT_DIR}")
    print("\nSummary:")
    print(f"  - Images processed: {len(df_with_features)}")
    print(f"  - Clusters found: {optimal_n}")
    print(f"  - Train: {summary['split_distribution']['train']}")
    print(f"  - Val: {summary['split_distribution']['val']}")
    print(f"  - Test: {summary['split_distribution']['test']}")
    print("\nFiles generated:")
    print("  - clustering_results.csv (Main CSV)")
    print("  - summary_clustering.json (Detailed summary)")
    print("  - features.npy (Extracted features)")
    print("  - clustering_metrics.png (Evaluation metrics)")
    print("  - clusters_visualization_*.png (Visualizations)")
    print("  - dendrogram.png (Hierarchical dendrogram)")
    print("  - cluster_heatmap_*.png (Cluster composition)")
    print("="*80 + "\n")


if __name__ == '__main__':
    main()