"""
Script to join CSV + JSON metadata.
====================================

This script merges metadata from:
- CSV: change_name.csv (column Old_Name)
- JSON: gbif_metadata.json (field basename)

FUNCTIONALITY:
- Merges both files based on the file name (basename)
- Combines ALL data from both files
- Saves the result in a unified CSV

Author: Sofia Garcia Arcila
Date: October 2025
Version: 1.0
"""

import pandas as pd       # Data manipulation
import json               # JSON file handling
import os                 # System operations
from pathlib import Path  # Modern path handling

# =============================================================================
# AUXILIARY FUNCTIONS
# =============================================================================

def extract_basename_from_csv_name(Old_Name):
    """
    Extract the basename (file name without extension) from a CSV name cell.

    Args:
        Old_Name (str): Name of the file from the CSV (e.g., "imagen.jpg").
            Missing cells (NaN) and empty strings are tolerated.

    Returns:
        str: File name without its extension (e.g., "imagen"), or '' when
        the input is missing or empty.
    """
    # Guard against missing cells (NaN) and empty strings coming from pandas.
    if pd.isna(Old_Name) or Old_Name == '':
        return ''

    # str() makes this robust to non-string cells (e.g., purely numeric file
    # names that pandas parsed as int/float); the original crashed on those.
    basename = os.path.splitext(str(Old_Name))[0]
    return basename
def load_csv_data(csv_path):
    """
    Load a CSV file, falling back from UTF-8 to Latin-1 encoding.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: DataFrame with the CSV data, or None if loading failed.
    """
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
        print("CSV loaded with UTF-8 encoding")
        return df
    except UnicodeDecodeError:
        # Not valid UTF-8; retry with Latin-1, which accepts any byte value.
        try:
            df = pd.read_csv(csv_path, encoding='latin-1')
            print("CSV loaded with Latin-1 encoding")
            return df
        except Exception as e:
            print(f"Error loading CSV: {e}")
            return None
    except Exception as e:
        # Any other failure (missing file, parse error, ...) is reported and
        # signalled with None, matching load_json_data's contract. The
        # original let these propagate uncaught.
        print(f"Error loading CSV: {e}")
        return None
def load_json_data(json_path):
    """
    Load a JSON file and convert it to a DataFrame.

    Args:
        json_path (str): Path to the JSON file. Expected to contain a list of
            records (or any structure accepted by pd.DataFrame).

    Returns:
        pd.DataFrame: DataFrame with the JSON data, or None if loading failed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Convert the parsed structure into a tabular DataFrame.
        df = pd.DataFrame(json_data)
        print("JSON loaded successfully")
        return df
    except Exception as e:
        # Covers missing file, malformed JSON, and DataFrame conversion errors.
        print(f"Error loading JSON: {e}")
        return None
def join_csv_json_metadata(csv_path=None, json_path=None, output_path=None):
    """
    Join the metadata from the CSV and the JSON into a single output CSV.

    The two sources are matched on the image basename: `Old_Name` in the CSV
    (extension stripped) against the `basename` field in the JSON. An OUTER
    merge keeps every record from both files.

    Args:
        csv_path (str, optional): Path to the rename CSV. Defaults to the
            original hard-coded location.
        json_path (str, optional): Path to the GBIF metadata JSON. Defaults
            to the original hard-coded location.
        output_path (str, optional): Path of the merged CSV to write.
            Defaults to the original hard-coded location.

    Returns:
        bool: True if the process was successful, False otherwise.
    """
    print("Starting JOINING CSV + JSON METADATA")
    print("=" * 60)

    # =========================================================================
    # 1. Define PATHS (defaults preserve the original behaviour)
    # =========================================================================
    if csv_path is None:
        csv_path = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\change_namesAV.csv"
    if json_path is None:
        json_path = r"C:\Users\sof12\Desktop\ML\metadata\gbif_metadata.json"
    if output_path is None:
        output_path = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\joined_metadata.csv"

    print("Files to process:")
    print(f" • CSV: {csv_path}")
    print(f" • JSON: {json_path}")
    print(f" • Output: {output_path}")

    # =========================================================================
    # 2. Verify FILES EXISTENCE
    # =========================================================================
    print("\n Verifying files")

    if not os.path.exists(csv_path):
        print(f"Error: No CSV file found at {csv_path}")
        return False

    if not os.path.exists(json_path):
        print(f"Error: No JSON file found at {json_path}")
        return False

    print("Both files found")

    # =========================================================================
    # 3. LOAD CSV FILE
    # =========================================================================
    print("\n Loading CSV file")

    df_csv = load_csv_data(csv_path)
    if df_csv is None:
        return False

    print(f"Registers in CSV: {len(df_csv)}")
    print(f"Columns in CSV: {list(df_csv.columns)}")

    # The merge key is derived from this column, so it must be present.
    if 'Old_Name' not in df_csv.columns:
        print("Error: 'Old_Name' column does not exist in the CSV")
        print(f" Available columns: {list(df_csv.columns)}")
        return False

    # =========================================================================
    # 4. LOAD JSON FILE
    # =========================================================================
    print("\n Loading JSON file...")

    df_json = load_json_data(json_path)
    if df_json is None:
        return False

    print(f"Registers in JSON: {len(df_json)}")
    print(f"Columns in JSON: {len(df_json.columns)}")

    if 'basename' not in df_json.columns:
        print("Error: 'basename' field was not found in the JSON")
        print(f" Available columns: {list(df_json.columns)}")
        return False

    # =========================================================================
    # 5. Prepare DATA FOR MERGE
    # =========================================================================
    print("\n Preparing data for merge...")

    # Extract basename from CSV (strip the extension from Old_Name).
    df_csv['basename_csv'] = df_csv['Old_Name'].apply(extract_basename_from_csv_name)

    # The JSON already has 'basename'; duplicate it under a distinct name so
    # that, after the merge, we can tell which side each row came from.
    df_json['basename_json'] = df_json['basename']

    unique_basenames_csv = df_csv['basename_csv'].nunique()
    unique_basenames_json = df_json['basename_json'].nunique()

    print(f"Unique basenames in CSV: {unique_basenames_csv}")
    print(f"Unique basenames in JSON: {unique_basenames_json}")

    # Show a few example keys from each side to ease debugging of mismatches.
    print("\n EXAMPLES OF BASENAMES:")
    print(" CSV (first 5):")
    for i, basename in enumerate(df_csv['basename_csv'].head().tolist()):
        print(f" {i+1}. '{basename}'")

    print(" JSON (first 5):")
    for i, basename in enumerate(df_json['basename_json'].head().tolist()):
        print(f" {i+1}. '{basename}'")

    # =========================================================================
    # 6. Do the MERGE
    # =========================================================================
    print("\n Performing data merge")

    # OUTER merge: keep ALL records from both files, matched or not.
    df_merged = pd.merge(
        df_csv,
        df_json,
        left_on='basename_csv',
        right_on='basename_json',
        how='outer',
        suffixes=('_csv', '_json')
    )

    print("Join completed")
    print(f"Total records after merge: {len(df_merged)}")

    # =========================================================================
    # 7. Analyze MERGE RESULTS
    # =========================================================================
    print("\n ANALYZE RESULTS:")

    # A row came from both sides iff both key columns are populated.
    both_found = df_merged['basename_csv'].notna() & df_merged['basename_json'].notna()
    only_csv = df_merged['basename_csv'].notna() & df_merged['basename_json'].isna()
    only_json = df_merged['basename_csv'].isna() & df_merged['basename_json'].notna()

    matches = both_found.sum()
    csv_only_count = only_csv.sum()
    json_only_count = only_json.sum()

    print(f" • Founded coincidences: {matches}")
    print(f" • Just in the CSV: {csv_only_count}")
    print(f" • Just in the JSON: {json_only_count}")

    # Match percentages relative to each source's own record count.
    total_csv = len(df_csv)
    total_json = len(df_json)
    match_pct_csv = (matches / total_csv * 100) if total_csv > 0 else 0
    match_pct_json = (matches / total_json * 100) if total_json > 0 else 0

    print(f" • Percentage of CSV with match: {match_pct_csv:.1f}%")
    print(f" • Percentage of JSON with match: {match_pct_json:.1f}%")

    # =========================================================================
    # 8. CLEAN AND ORGANIZE FINAL DATA
    # =========================================================================
    print("\n Cleaning and organizing final data...")

    # Unified key: prefer the CSV basename, fall back to the JSON one.
    df_merged['basename_final'] = df_merged['basename_csv'].fillna(df_merged['basename_json'])

    # Put the most important columns first (only those actually present).
    important_columns = ['basename_final', 'Old_Name', 'New_Name']
    important_columns = [col for col in important_columns if col in df_merged.columns]

    # Everything else follows, except the internal merge-key columns.
    other_columns = [col for col in df_merged.columns
                     if col not in important_columns + ['basename_csv', 'basename_json']]

    df_final = df_merged[important_columns + other_columns]

    print("Organized data")
    print(f"Columns in the final file: {len(df_final.columns)}")

    # =========================================================================
    # 9. SAVE THE RESULTING FILE
    # =========================================================================
    print("\n💾 Saving resulting file...")

    try:
        # Create the destination directory if needed. Guarding the empty
        # dirname avoids os.makedirs('') raising when output_path is a bare
        # file name (a bug in the original).
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df_final.to_csv(output_path, index=False, encoding='utf-8')

        # Verify that the file actually landed on disk.
        if os.path.exists(output_path):
            file_size = os.path.getsize(output_path)
            print("File saved successfully")
            print(f"Path: {output_path}")
            print(f"Records: {len(df_final)}")
            print(f"Columns: {len(df_final.columns)}")
            print(f"Size: {file_size / 1024:.2f} KB")
        else:
            print("Error: The file was not saved correctly")
            return False

    except Exception as e:
        print(f"Error saving file: {e}")
        return False

    # =========================================================================
    # 10. SHOW PREVIEW OF THE RESULT
    # =========================================================================
    print("\n SHOWING PREVIEW OF THE RESULT:")

    try:
        print("Columns included:")
        for i, col in enumerate(df_final.columns[:10]):  # show only the first 10
            print(f" {i+1:2d}. {col}")

        if len(df_final.columns) > 10:
            print(f" ... and {len(df_final.columns) - 10} more columns")

        # Show sample data for a few key columns only.
        sample_columns = [col for col in ['basename_final', 'Old_Name', 'New_Name']
                          if col in df_final.columns]

        if sample_columns:
            print("\n 📊 Sample data (first 5 rows):")
            print(df_final[sample_columns].head().to_string(index=False))

    except Exception as e:
        # The preview is best-effort; a failure here must not fail the run.
        print(f" ⚠️ Error showing preview: {e}")

    return True
# =============================================================================
# MAIN FUNCTION
# =============================================================================

def main():
    """
    Entry point: run the CSV + JSON metadata join and report the outcome.
    """
    print("Join CSV + JSON METADATA")
    # The original header named 'cambios_nombres.csv'; the script actually
    # reads change_namesAV.csv (see join_csv_json_metadata's default paths).
    print("Joining change_namesAV.csv with gbif_metadata.json")
    print("Based on: basename (JSON) ↔ Old_Name (CSV)")
    print("=" * 60)

    # Run the join process.
    success = join_csv_json_metadata()

    if success:
        print("\n PROCESS COMPLETED SUCCESSFULLY!")
        # Fixed: the original pointed users at 'metadatos_unidos.csv' in the
        # 'Nocciola' folder, but the script writes joined_metadata.csv to the
        # Carciofo folder.
        print("Check the file 'joined_metadata.csv' in the Carciofo folder")
        print("The file contains ALL data from both original files")
    else:
        print("\n The process failed. Check the errors shown above.")
# =============================================================================
# SCRIPT ENTRY POINT
# =============================================================================

# Run the join only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()