Phenology/Code/Phenology_V1.py

365 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Agricultural Phenology Project - CNN Training Pipeline
Description:
This script implements a complete machine learning pipeline for phenological phase
classification in crops using Convolutional Neural Networks (CNN). The system is
designed to automatically identify different growth stages of plants from images.
MAIN FUNCTIONALITIES:
- Automatic data split preparation (train/val/test)
- CNN model training (MobileNetV2 by default)
- Data augmentation to improve model robustness
- Comprehensive evaluation with metrics and visualizations
- Automatic saving of models and reports
Usage:
    python train_pipeline_vscode.py --images_dir ./Datasets/Artichoke/Artichoke_1 \
        --csv ./Datasets/Artichoke/Artichoke_1.csv \
        --out_dir ./Datasets/results \
        --model mobilenet \
        --epochs 15
Requirements:
- Images must be named as <id_img>.jpg
- CSV file must contain 'id_img' and 'fase' columns
AUTHOR: Sofia Garcia Arcila
DATE: August 2025
VERSION: 1.0
"""
# =============================================================================
# LIBRARY IMPORTS
# =============================================================================
# System and file handling libraries
import os # Operating system interface (paths, directories)
import shutil # High-level file operations (copy, move, delete)
import argparse # Command-line argument parsing
import random # Random number generation for reproducibility
import pandas as pd # Data manipulation and analysis
import numpy as np # Numerical computing with multidimensional arrays
from pathlib import Path # Modern path handling
# Data visualization libraries
import matplotlib.pyplot as plt # Plotting and visualization
import seaborn as sns # Statistical data visualization
# Deep learning libraries (TensorFlow and Keras)
import tensorflow as tf # Deep learning framework
from tensorflow.keras import layers, models # Neural network architecture building
from tensorflow.keras.applications import MobileNetV2 # Pre-trained model optimized for mobile devices
from tensorflow.keras.preprocessing.image import ImageDataGenerator # Data generator with augmentation capabilities
# Traditional machine learning libraries for evaluation
from sklearn.metrics import classification_report, confusion_matrix # Evaluation metrics
# =============================================================================
# COMMAND-LINE ARGUMENT CONFIGURATION
# =============================================================================
def parse_args(argv=None):
    """
    Configure and parse command-line arguments.

    Lets users customize every important pipeline parameter without
    modifying the source code.

    Args:
        argv (list[str] | None): Argument strings to parse. ``None`` (the
            default) makes argparse read them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Object containing all parsed arguments.

    Available arguments:
        --images_dir: Directory containing all images (format: id_img.jpg)
        --csv: CSV file with columns 'id_img' and 'fase'
        --out_dir: Directory where all results will be saved
        --img_size: Size to which images will be resized (default: 224px)
        --batch_size: Number of images processed simultaneously (default: 32)
        --epochs: Maximum number of training epochs (default: 15)
        --seed: Seed for result reproducibility (default: 42)
        --model: Type of model to train ('mobilenet' or 'simplecnn')
    """
    parser = argparse.ArgumentParser(
        description="Training pipeline for phenological phase classification",
        # This formatter only appends "(default: ...)" to arguments that have
        # a help string, which is why every argument below carries one.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('--images_dir', required=True,
                        help='Directory containing all images (format: id_img.jpg)')
    parser.add_argument('--csv', required=True,
                        help="CSV file with columns 'id_img' and 'fase'")
    parser.add_argument('--out_dir', required=True,
                        help='Directory where all results will be saved')
    parser.add_argument('--img_size', type=int, default=224,
                        help='Side length in pixels to which images are resized')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Number of images processed simultaneously')
    parser.add_argument('--epochs', type=int, default=15,
                        help='Maximum number of training epochs')
    parser.add_argument('--seed', type=int, default=42,
                        help='Seed for result reproducibility')
    parser.add_argument('--model', choices=['mobilenet', 'simplecnn'], default='mobilenet',
                        help='Type of model to train')
    return parser.parse_args(argv)
# =============================================================================
# DATA PROCESSING FUNCTIONS
# =============================================================================
def prepare_splits(df, images_dir, out_dir, split=None, seed=42):
    """
    Prepare data division into training, validation, and test sets.

    Shuffles the rows of ``df``, slices them into three consecutive subsets
    and copies each row's image into ``out_dir/<subset>/<fase>/``, a layout
    that Keras ``flow_from_directory`` can read directly.

    Args:
        df (pd.DataFrame): DataFrame with columns 'id_img' and 'fase'
        images_dir (str): Source directory containing all images
        out_dir (str): Directory where the organized structure will be created
        split (dict | None): Proportions keyed by 'train' and 'val'. The test
            set always receives the remaining rows, so a 'test' entry is
            accepted but not read. ``None`` means
            ``{'train': 0.7, 'val': 0.15, 'test': 0.15}``.
        seed (int): Seed for reproducible shuffling

    Returns:
        tuple: (train_df, val_df, test_df) - DataFrames for each set. Rows
        whose image file was not found are still included in these frames.

    Created directory structure:
        out_dir/
        ├── train/
        │   ├── phase1/
        │   │   ├── image1.jpg
        │   │   └── image2.jpg
        │   └── phase2/
        │       └── image3.jpg
        ├── val/
        │   └── [same structure]
        └── test/
            └── [same structure]
    """
    if split is None:
        # Built per call so no dict object is shared between invocations.
        split = {'train': 0.7, 'val': 0.15, 'test': 0.15}

    print("📂 Preparing data splits...")
    # Seeds Python's global RNG too; the shuffle itself uses random_state.
    random.seed(seed)
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    total = len(shuffled)
    n_train = int(total * split['train'])
    n_val = int(total * split['val'])
    subsets = {
        'train': shuffled.iloc[:n_train],
        'val': shuffled.iloc[n_train:n_train + n_val],
        'test': shuffled.iloc[n_train + n_val:],  # remainder
    }

    # Create one folder per phase in every subset, even if it stays empty,
    # so all three subsets expose the same set of class directories.
    labels = df['fase'].unique().tolist()
    for subset_name in subsets:
        for lbl in labels:
            os.makedirs(os.path.join(out_dir, subset_name, str(lbl)), exist_ok=True)

    def copy_subset(subdf, subset_name):
        """
        Copy the images of one subset into its per-phase folders.

        Args:
            subdf (pd.DataFrame): DataFrame of the subset (train/val/test)
            subset_name (str): Dataset name ('train', 'val', 'test')

        Returns:
            int: Number of rows whose source image does not exist.
        """
        missing = 0
        # Iterate column-wise: iterrows() converts each row to a single-dtype
        # Series, turning an integer id into a float ("1.0.jpg") whenever
        # 'fase' is a float column.
        for img_id, phase in zip(subdf['id_img'].tolist(), subdf['fase'].tolist()):
            filename = f"{img_id}.jpg"
            src = os.path.join(images_dir, filename)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(out_dir, subset_name, str(phase), filename))
            else:
                missing += 1
        return missing

    missing_total = sum(copy_subset(subdf, name) for name, subdf in subsets.items())
    if missing_total:
        # Still best-effort (no exception), but no longer silent.
        print(f"⚠️ {missing_total} image(s) listed in the CSV were not found in "
              f"{images_dir} and were skipped")

    return subsets['train'], subsets['val'], subsets['test']
# =============================================================================
# MODEL BUILDING FUNCTIONS
# =============================================================================
def build_model(img_size, n_classes, kind='mobilenet'):
    """
    Build and compile a convolutional neural network model.

    Args:
        img_size (int): Size of input images (assumes square images)
        n_classes (int): Number of classes for classification
        kind (str): Model type ('mobilenet' for transfer learning,
            'simplecnn' for simple CNN)

    Returns:
        tf.keras.Model: Model compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.

    Raises:
        ValueError: If ``kind`` is not one of the supported architectures.

    Available architectures:
        - 'mobilenet': MobileNetV2 pre-trained on ImageNet with a frozen base
          and a small trainable classification head
        - 'simplecnn': Simple CNN built from scratch

    NOTE(review): the ImageNet MobileNetV2 weights are documented as expecting
    inputs scaled to [-1, 1] (mobilenet_v2.preprocess_input) - confirm that
    the preprocessing applied by the caller matches.
    """
    # Fail fast on typos instead of silently training the simple CNN.
    if kind not in ('mobilenet', 'simplecnn'):
        raise ValueError(f"Unknown model kind {kind!r}; expected 'mobilenet' or 'simplecnn'")

    input_shape = (img_size, img_size, 3)

    if kind == 'mobilenet':
        print("🏗️ Building model with Transfer Learning (MobileNetV2)...")
        base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze the pre-trained feature extractor; only the head is trained.
        base_model.trainable = False
        model = models.Sequential([
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(0.3),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(n_classes, activation='softmax'),
        ])
    else:
        print("🏗️ Building simple CNN from scratch...")
        model = models.Sequential([
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
            layers.MaxPooling2D(2, 2),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D(2, 2),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(n_classes, activation='softmax'),
        ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
# =============================================================================
# VISUALIZATION FUNCTIONS
# =============================================================================
def plot_history(history, out_dir):
    """
    Plot training and validation accuracy per epoch and save the figure.

    Args:
        history (tf.keras.callbacks.History): History returned by model.fit();
            its ``history`` dict must hold 'accuracy' and 'val_accuracy'.
        out_dir (str): Directory where the plot is saved

    Generates:
        - 'accuracy.png' in out_dir, written at 300 dpi
    """
    print("📊 Generating training history plot...")
    metrics = history.history
    curves = (
        ('accuracy', 'Training Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for metric_key, legend_name in curves:
        ax.plot(metrics[metric_key], label=legend_name)

    ax.set_title('Accuracy Evolution during Training', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend(fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    target_path = os.path.join(out_dir, 'accuracy.png')
    fig.savefig(target_path, dpi=300, bbox_inches='tight')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
# =============================================================================
# MAIN FUNCTION
# =============================================================================
def _load_labels_csv(csv_path):
    """Read the labels CSV (UTF-8, then Latin-1 fallback) and check its columns."""
    try:
        df = pd.read_csv(csv_path)
        print(" ✅ CSV loaded successfully with UTF-8 encoding")
    except UnicodeDecodeError:
        print(" ⚠️ Error with UTF-8, trying Latin-1...")
        df = pd.read_csv(csv_path, encoding='latin-1')
        print(" ✅ CSV loaded successfully with Latin-1 encoding")

    required_cols = {'id_img', 'fase'}
    if not required_cols.issubset(set(df.columns)):
        raise ValueError(f'CSV must contain columns: {required_cols}')
    return df


def _make_generators(split_dir, img_size, batch_size, seed):
    """Create the augmented training generator and the plain val/test generators."""
    # Training images are rescaled to [0, 1] and randomly augmented.
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
    )
    # Validation and test images are only rescaled, never augmented.
    eval_datagen = ImageDataGenerator(rescale=1. / 255)

    common = dict(target_size=(img_size, img_size),
                  batch_size=batch_size,
                  class_mode='categorical')
    train_gen = train_datagen.flow_from_directory(
        os.path.join(split_dir, 'train'), seed=seed, **common)
    val_gen = eval_datagen.flow_from_directory(
        os.path.join(split_dir, 'val'), **common)
    # shuffle=False keeps predictions aligned with test_gen.classes.
    test_gen = eval_datagen.flow_from_directory(
        os.path.join(split_dir, 'test'), shuffle=False, **common)
    return train_gen, val_gen, test_gen


def _evaluate(model, test_gen, class_indices):
    """Predict on the test generator; return (report, confusion matrix, class names)."""
    y_pred = np.argmax(model.predict(test_gen, verbose=1), axis=1)
    y_true = test_gen.classes

    # Class names ordered by their numeric index.
    class_labels = sorted(class_indices, key=class_indices.get)
    # Passing the full label list keeps report rows, matrix shape and tick
    # labels consistent even when a class is missing from the test set or is
    # predicted without being present in it.
    label_ids = list(range(len(class_labels)))
    report = classification_report(y_true, y_pred, labels=label_ids,
                                   target_names=class_labels, zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    return report, cm, class_labels


def main():
    """
    Main function that executes the complete training pipeline.

    Execution flow:
        1. Parse command-line arguments
        2. Load and validate data
        3. Prepare data splits (the 'split_data' folder inside the output
           directory is deleted and rebuilt on every run)
        4. Configure data generators with augmentation
        5. Build and train model
        6. Evaluate model on test set
        7. Generate reports and visualizations
        8. Save trained model and results

    Files written to the output directory: best_model.h5,
    classification_report.txt, confusion_matrix.csv, accuracy.png,
    confusion_matrix.png and final_model.h5.

    Raises:
        ValueError: If the CSV lacks the 'id_img' or 'fase' column.
    """
    print("🚀 STARTING PHENOLOGICAL CLASSIFICATION TRAINING PIPELINE")
    print("=" * 80)
    args = parse_args()
    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)

    # Apply the seed to every RNG involved (shuffling, augmentation, weight
    # initialisation), not only to the data split.
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    df = _load_labels_csv(args.csv)

    # Rebuild the split from scratch so files of a previous run cannot leak in.
    split_dir = os.path.join(out_dir, 'split_data')
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    prepare_splits(df, args.images_dir, split_dir, seed=args.seed)

    train_gen, val_gen, test_gen = _make_generators(
        split_dir, args.img_size, args.batch_size, args.seed)

    model = build_model(args.img_size, train_gen.num_classes, kind=args.model)
    model.summary()

    best_model_path = os.path.join(out_dir, 'best_model.h5')
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                         restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint(best_model_path, save_best_only=True),
    ]
    history = model.fit(train_gen, validation_data=val_gen,
                        epochs=args.epochs, callbacks=callbacks)
    # Evaluate with the checkpointed best weights.
    model.load_weights(best_model_path)

    report, cm, class_labels = _evaluate(model, test_gen, train_gen.class_indices)
    print('\nClassification Report:\n', report)

    report_path = os.path.join(out_dir, 'classification_report.txt')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("CLASSIFICATION REPORT - PHENOLOGY MODEL\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Model: {args.model}\n")
        f.write(f"Image size: {args.img_size}px\n")
        f.write(f"Epochs trained: {len(history.history['accuracy'])}\n")
        f.write(f"Best validation accuracy: {max(history.history['val_accuracy']):.4f}\n\n")
        f.write(report)
    np.savetxt(os.path.join(out_dir, 'confusion_matrix.csv'), cm, delimiter=',', fmt='%d')

    # Visualizations
    plot_history(history, out_dir)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels,
                yticklabels=class_labels, cmap='Blues')
    plt.xlabel('Predicción')
    plt.ylabel('Verdadero')
    plt.title('Matriz de confusión')
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, 'confusion_matrix.png'))
    plt.close()

    # Save the final model
    model.save(os.path.join(out_dir, 'final_model.h5'))
    print('Resultados guardados en', out_dir)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()