#!/usr/bin/env python3
"""
Agricultural Phenology Project - CNN Training Pipeline

Description:
    This script implements a complete machine learning pipeline for
    phenological phase classification in crops using Convolutional Neural
    Networks (CNN). The system is designed to automatically identify
    different growth stages of plants from images.

MAIN FUNCTIONALITIES:
    - Automatic data split preparation (train/val/test)
    - CNN model training (MobileNetV2 by default)
    - Data augmentation to improve model robustness
    - Comprehensive evaluation with metrics and visualizations
    - Automatic saving of models and reports

Usage:
    python train_pipeline_vscode.py --images_dir ./Datasets/Artichoke/Artichoke_1 --csv ./Datasets/Artichoke/Artichoke_1.csv --out_dir ./Datasets/results --model mobilenet --epochs 15

Requirements:
    - Images must be named as <id_img>.jpg
    - CSV file must contain 'id_img' and 'fase' columns

AUTHOR: Sofia Garcia Arcila
DATE: August 2025
VERSION: 1.0
"""

# =============================================================================
# LIBRARY IMPORTS
# =============================================================================

# System and file handling libraries
import argparse            # Command-line argument parsing
import os                  # Operating system interface (paths, directories)
import random              # Random number generation for reproducibility
import shutil              # High-level file operations (copy, move, delete)
from pathlib import Path   # Modern path handling

# Data manipulation libraries
import numpy as np         # Numerical computing with multidimensional arrays
import pandas as pd        # Data manipulation and analysis

# Data visualization libraries
import matplotlib.pyplot as plt   # Plotting and visualization
import seaborn as sns             # Statistical data visualization

# Deep learning libraries (TensorFlow and Keras)
import tensorflow as tf                                    # Deep learning framework
from tensorflow.keras import layers, models                # Neural network architecture building
from tensorflow.keras.applications import MobileNetV2      # Pre-trained model optimized for mobile devices
from tensorflow.keras.preprocessing.image import ImageDataGenerator  # Data generator with augmentation capabilities

# Traditional machine learning libraries for evaluation
from sklearn.metrics import classification_report, confusion_matrix  # Evaluation metrics


# =============================================================================
# COMMAND-LINE ARGUMENT CONFIGURATION
# =============================================================================

def parse_args():
    """
    Configure and parse command-line arguments.

    This function allows users to customize all important pipeline parameters
    without needing to modify the source code.

    Returns:
        argparse.Namespace: Object containing all parsed arguments

    Available arguments:
        --images_dir: Directory containing all images (format: id_img.jpg)
        --csv: CSV file with columns 'id_img' and 'fase'
        --out_dir: Directory where all results will be saved
        --img_size: Size to which images will be resized (default: 224px)
        --batch_size: Number of images processed simultaneously (default: 32)
        --epochs: Maximum number of training epochs (default: 15)
        --seed: Seed for result reproducibility (default: 42)
        --model: Type of model to train ('mobilenet' or 'simplecnn')
    """
    p = argparse.ArgumentParser(
        description="Training pipeline for phenological phase classification",
        # ArgumentDefaultsHelpFormatter only appends "(default: ...)" to
        # arguments that define a help string, hence the help= on each one.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument('--images_dir', required=True,
                   help="Directory containing all images (format: id_img.jpg)")
    p.add_argument('--csv', required=True,
                   help="CSV file with columns 'id_img' and 'fase'")
    p.add_argument('--out_dir', required=True,
                   help="Directory where all results will be saved")
    p.add_argument('--img_size', type=int, default=224,
                   help="Side length in pixels to which images are resized")
    p.add_argument('--batch_size', type=int, default=32,
                   help="Number of images processed simultaneously")
    p.add_argument('--epochs', type=int, default=15,
                   help="Maximum number of training epochs")
    p.add_argument('--seed', type=int, default=42,
                   help="Seed for result reproducibility")
    p.add_argument('--model', choices=['mobilenet', 'simplecnn'], default='mobilenet',
                   help="Type of model to train")
    return p.parse_args()


# =============================================================================
# DATA PROCESSING FUNCTIONS
# =============================================================================

def prepare_splits(df, images_dir, out_dir, split=None, seed=42):
    """
    Prepare data division into training, validation, and test sets.

    This function organizes images into a directory structure compatible
    with Keras ImageDataGenerator, facilitating model training.

    Args:
        df (pd.DataFrame): DataFrame with columns 'id_img' and 'fase'
        images_dir (str): Source directory containing all images
        out_dir (str): Directory where organized structure will be created
        split (dict | None): Proportions for each set, with keys 'train',
            'val' and 'test'. Only 'train' and 'val' are read; the test set
            receives every remaining row. Defaults to
            {'train': 0.7, 'val': 0.15, 'test': 0.15} when None.
        seed (int): Seed for reproducible shuffling

    Returns:
        tuple: (train_df, val_df, test_df) - DataFrames for each set. Rows
        whose image file is missing are still present in these DataFrames;
        only the file copy is skipped.

    Created directory structure:
        out_dir/
        ├── train/
        │   ├── phase1/
        │   │   ├── image1.jpg
        │   │   └── image2.jpg
        │   └── phase2/
        │       └── image3.jpg
        ├── val/
        │   └── [same structure]
        └── test/
            └── [same structure]
    """
    if split is None:
        split = {'train': 0.7, 'val': 0.15, 'test': 0.15}

    print("📂 Preparing data splits...")
    random.seed(seed)  # Set seed for reproducibility
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    n = len(df_shuffled)
    n_train = int(n * split['train'])
    n_val = int(n * split['val'])

    # Split the DataFrame into three sets
    train_df = df_shuffled.iloc[:n_train]
    val_df = df_shuffled.iloc[n_train:n_train + n_val]
    test_df = df_shuffled.iloc[n_train + n_val:]

    # Create directory structure for each set and each phase. Every label gets
    # a folder in every subset so all three generators see the same classes.
    labels = df['fase'].unique()
    for part in ['train', 'val', 'test']:
        for lbl in labels:
            os.makedirs(os.path.join(out_dir, part, str(lbl)), exist_ok=True)

    def copy_subset(subdf, subset_name):
        """
        Auxiliary function to copy images from a specific subset.

        Args:
            subdf (pd.DataFrame): DataFrame of the subset (train/val/test)
            subset_name (str): Dataset name ('train', 'val', 'test')

        Returns:
            int: Number of rows whose source image was not found
        """
        not_found = 0
        # Iterate the two columns directly instead of iterrows(): iterrows()
        # builds one Series per row and upcasts integer ids/labels to float
        # when every column is numeric, which would yield names like "1.0.jpg".
        for img_id, phase in zip(subdf['id_img'], subdf['fase']):
            filename = f"{img_id}.jpg"
            src = os.path.join(images_dir, filename)
            dst = os.path.join(out_dir, subset_name, str(phase), filename)
            if os.path.exists(src):
                shutil.copy(src, dst)
            else:
                not_found += 1
        return not_found

    missing = (
        copy_subset(train_df, 'train')
        + copy_subset(val_df, 'val')
        + copy_subset(test_df, 'test')
    )
    if missing:
        # Best-effort behaviour is kept (missing files are skipped), but the
        # skip is no longer silent.
        print(f"⚠️ {missing} image(s) listed in the CSV were not found in {images_dir} and were skipped")

    return train_df, val_df, test_df


# =============================================================================
# MODEL BUILDING FUNCTIONS
# =============================================================================

def build_model(img_size, n_classes, kind='mobilenet'):
    """
    Build and compile a convolutional neural network model.

    Args:
        img_size (int): Size of input images (assumes square images)
        n_classes (int): Number of classes for classification
        kind (str): Model type ('mobilenet' for transfer learning; any other
            value builds the simple CNN)

    Returns:
        tf.keras.Model: Compiled model ready for training

    Available architectures:
        - 'mobilenet': Transfer learning with MobileNetV2 pre-trained on ImageNet
        - 'simplecnn': Simple CNN built from scratch
    """
    input_shape = (img_size, img_size, 3)

    if kind == 'mobilenet':
        print("🏗️ Building model with Transfer Learning (MobileNetV2)...")
        base_model = MobileNetV2(weights='imagenet', include_top=False,
                                 input_shape=input_shape)
        # Freeze the backbone: only the classification head is trained.
        base_model.trainable = False
        model = models.Sequential([
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(0.3),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(n_classes, activation='softmax'),
        ])
    else:
        model = models.Sequential([
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
            layers.MaxPooling2D(2, 2),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D(2, 2),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(n_classes, activation='softmax'),
        ])

    # categorical_crossentropy matches the one-hot labels produced by the
    # generators in main() (class_mode='categorical').
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# =============================================================================
# VISUALIZATION FUNCTIONS
# =============================================================================

def plot_history(history, out_dir):
    """
    Create training history plot (accuracy vs epochs).

    Args:
        history (tf.keras.callbacks.History): History returned by model.fit()
        out_dir (str): Directory where to save the plot

    Generates:
        - Plot of training vs validation accuracy per epoch
        - 'accuracy.png' file saved in out_dir
    """
    print("📊 Generating training history plot...")
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy Evolution during Training', fontsize=14, fontweight='bold')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'accuracy.png'), dpi=300, bbox_inches='tight')
    plt.close()


# =============================================================================
# MAIN FUNCTION
# =============================================================================

def main():
    """
    Main function that executes the complete training pipeline.

    Execution flow:
        1. Parse command-line arguments
        2. Load and validate data
        3. Prepare data splits
        4. Configure data generators with augmentation
        5. Build and train model
        6. Evaluate model on test set
        7. Generate reports and visualizations
        8. Save trained model and results

    Raises:
        ValueError: If the CSV lacks the 'id_img' or 'fase' column
    """
    print("🚀 STARTING PHENOLOGICAL CLASSIFICATION TRAINING PIPELINE")
    print("=" * 80)

    args = parse_args()
    images_dir = args.images_dir
    csv_path = args.csv
    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)

    # --seed is documented as controlling reproducibility, so seed NumPy
    # (used by the Keras image augmentation) and TensorFlow as well as the
    # split performed in prepare_splits().
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    # Load the CSV, falling back to Latin-1 when it is not valid UTF-8.
    try:
        df = pd.read_csv(csv_path)
        print(" ✅ CSV loaded successfully with UTF-8 encoding")
    except UnicodeDecodeError:
        print(" ⚠️ Error with UTF-8, trying Latin-1...")
        df = pd.read_csv(csv_path, encoding='latin-1')
        print(" ✅ CSV loaded successfully with Latin-1 encoding")

    required_cols = {'id_img', 'fase'}
    if not required_cols.issubset(set(df.columns)):
        raise ValueError(f'CSV must contain columns: {required_cols}')

    # Always rebuild the split folder so files from a previous run cannot
    # leak into the new train/val/test sets.
    split_dir = os.path.join(out_dir, 'split_data')
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    prepare_splits(df, images_dir, split_dir, seed=args.seed)

    img_size = (args.img_size, args.img_size)
    batch_size = args.batch_size

    # NOTE(review): Keras' mobilenet_v2.preprocess_input scales pixels to
    # [-1, 1], while these generators rescale to [0, 1] — confirm this is
    # intentional for the 'mobilenet' model.

    # Generator for training WITH augmentation
    train_datagen = ImageDataGenerator(
        rescale=1./255,           # Normalization: [0,255] → [0,1]
        rotation_range=20,        # Random rotations
        width_shift_range=0.1,    # Horizontal shift
        height_shift_range=0.1,   # Vertical shift
        zoom_range=0.1,           # Random zoom
        horizontal_flip=True,     # Random horizontal flips
    )
    # Generator for validation and test WITHOUT augmentation
    test_datagen = ImageDataGenerator(rescale=1./255)

    # Create data flow generators
    train_gen = train_datagen.flow_from_directory(
        os.path.join(split_dir, 'train'), target_size=img_size,
        batch_size=batch_size, class_mode='categorical')
    val_gen = test_datagen.flow_from_directory(
        os.path.join(split_dir, 'val'), target_size=img_size,
        batch_size=batch_size, class_mode='categorical')
    # shuffle=False keeps predictions aligned with test_gen.classes.
    test_gen = test_datagen.flow_from_directory(
        os.path.join(split_dir, 'test'), target_size=img_size,
        batch_size=batch_size, class_mode='categorical', shuffle=False)

    model = build_model(args.img_size, train_gen.num_classes, kind=args.model)
    model.summary()

    best_model_path = os.path.join(out_dir, 'best_model.h5')
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                         restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint(best_model_path, save_best_only=True),
    ]
    history = model.fit(train_gen, validation_data=val_gen,
                        epochs=args.epochs, callbacks=callbacks)

    # Evaluate with the best checkpoint rather than the last epoch's weights.
    model.load_weights(best_model_path)
    y_pred_prob = model.predict(test_gen, verbose=1)
    y_pred = np.argmax(y_pred_prob, axis=1)
    y_true = test_gen.classes

    # Report over the full set of classes with explicit label ids. Without
    # `labels`, sklearn infers them from y_true ∪ y_pred, which can differ in
    # length from target_names (raising ValueError) and misalign the
    # confusion-matrix axes when a class is absent from the test set.
    class_labels = list(train_gen.class_indices.keys())
    label_ids = list(range(len(class_labels)))
    report = classification_report(y_true, y_pred, labels=label_ids,
                                   target_names=class_labels)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    print('\nClassification Report:\n', report)

    with open(os.path.join(out_dir, 'classification_report.txt'), 'w', encoding='utf-8') as f:
        f.write("CLASSIFICATION REPORT - PHENOLOGY MODEL\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Model: {args.model}\n")
        f.write(f"Image size: {args.img_size}px\n")
        f.write(f"Epochs trained: {len(history.history['accuracy'])}\n")
        f.write(f"Best validation accuracy: {max(history.history['val_accuracy']):.4f}\n\n")
        f.write(report)

    np.savetxt(os.path.join(out_dir, 'confusion_matrix.csv'), cm, delimiter=',', fmt='%d')

    # Visualizations
    plot_history(history, out_dir)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels,
                yticklabels=class_labels, cmap='Blues')
    plt.xlabel('Predicción')
    plt.ylabel('Verdadero')
    plt.title('Matriz de confusión')
    plt.savefig(os.path.join(out_dir, 'confusion_matrix.png'))
    plt.close()

    # Save final model
    model.save(os.path.join(out_dir, 'final_model.h5'))
    print('Resultados guardados en', out_dir)


if __name__ == '__main__':
    main()