"""
SCRIPT TO JOIN CSV + JSON METADATA
====================================

This script merges metadata from:
- CSV: change_namesAV.csv (column Old_Name)
- JSON: gbif_metadata.json (field basename)

FUNCTIONALITY:
- Merges both files based on the file name (basename)
- Combines ALL data from both files
- Saves the result in a unified CSV

Author: Sofia Garcia Arcila
Date: October 2025
Version: 1.0
"""

import json  # JSON file handling
import os  # System operations
from pathlib import Path, PureWindowsPath  # Modern path handling

import pandas as pd  # Data manipulation


# =============================================================================
# DEFAULT PATHS
# =============================================================================

# Used when join_csv_json_metadata() is called without arguments.
DEFAULT_CSV_PATH = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\change_namesAV.csv"
DEFAULT_JSON_PATH = r"C:\Users\sof12\Desktop\ML\metadata\gbif_metadata.json"
DEFAULT_OUTPUT_PATH = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\joined_metadata.csv"


# =============================================================================
# AUXILIARY FUNCTIONS
# =============================================================================

def extract_basename_from_csv_name(Old_Name):
    """
    Extracts the basename from the previous name of the CSV.

    Args:
        Old_Name: Name of the file from the CSV (e.g., "imagen.jpg").
            Missing values (NaN/None) and empty strings are accepted.

    Returns:
        str: Name of the file without extension (e.g., "imagen"), or ''
        when the value is missing or empty.
    """
    if pd.isna(Old_Name) or Old_Name == '':
        return ''

    # str() guards against cells that pandas parsed as numbers;
    # os.path.splitext raises TypeError on non-string input.
    return os.path.splitext(str(Old_Name))[0]


def load_csv_data(csv_path):
    """
    Loads the CSV file, trying UTF-8 first and Latin-1 as a fallback.

    Args:
        csv_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: DataFrame with the CSV data, or None if the file
        could not be loaded (the error is printed).
    """
    for encoding, label in (('utf-8', 'UTF-8'), ('latin-1', 'Latin-1')):
        try:
            df = pd.read_csv(csv_path, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong encoding guess: try the next candidate.
            continue
        except Exception as e:
            # Any other failure (missing/empty/malformed file) will not be
            # fixed by switching encodings, so report it and stop.
            print(f"Error loading CSV: {e}")
            return None
        print(f"CSV loaded with {label} encoding")
        return df

    print("Error loading CSV: could not decode the file as UTF-8 or Latin-1")
    return None


def load_json_data(json_path):
    """
    Loads the JSON file and converts it to a DataFrame.

    Args:
        json_path (str): Path to the JSON file

    Returns:
        pd.DataFrame: DataFrame with the JSON data, or None if the file
        could not be read, parsed or converted (the error is printed).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Convert to DataFrame
        df = pd.DataFrame(json_data)
        print("JSON loaded successfully")
        return df
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return None


def _print_basename_examples(label, basenames):
    """Prints the first five values of a basename column under `label`."""
    print(f" {label} (first 5):")
    for i, basename in enumerate(basenames.head().tolist()):
        print(f" {i+1}. '{basename}'")


def _report_merge_statistics(df_csv, df_json, df_merged):
    """Prints match counts of the merged frame and match rates per source."""
    has_csv = df_merged['basename_csv'].notna()
    has_json = df_merged['basename_json'].notna()

    matches = int((has_csv & has_json).sum())
    only_csv = int((has_csv & ~has_json).sum())
    only_json = int((~has_csv & has_json).sum())

    print(f" • Founded coincidences: {matches}")
    print(f" • Just in the CSV: {only_csv}")
    print(f" • Just in the JSON: {only_json}")

    # Percentages are computed per source row rather than per merged row:
    # duplicated basenames multiply merged rows and would otherwise push the
    # rate above 100%.
    csv_matched = df_csv['basename_csv'].isin(df_json['basename_json'].dropna())
    json_matched = df_json['basename_json'].isin(df_csv['basename_csv'])
    pct_csv = (csv_matched.sum() / len(df_csv) * 100) if len(df_csv) > 0 else 0
    pct_json = (json_matched.sum() / len(df_json) * 100) if len(df_json) > 0 else 0

    print(f" • Percentage of CSV with match: {pct_csv:.1f}%")
    print(f" • Percentage of JSON with match: {pct_json:.1f}%")


def _organize_final_columns(df_merged):
    """Adds 'basename_final' and returns the frame with key columns first."""
    # Unified basename: the CSV value when present, otherwise the JSON value.
    df_merged['basename_final'] = df_merged['basename_csv'].fillna(
        df_merged['basename_json']
    )

    # Most important columns first (only those that actually exist).
    leading = [
        col for col in ('basename_final', 'Old_Name', 'New_Name')
        if col in df_merged.columns
    ]
    # The temporary merge keys are dropped from the output.
    excluded = set(leading) | {'basename_csv', 'basename_json'}
    remaining = [col for col in df_merged.columns if col not in excluded]

    return df_merged[leading + remaining]


def _save_result(df_final, output_path):
    """Writes `df_final` to `output_path` as CSV; returns True on success."""
    try:
        # Create directory if it doesn't exist. Path.parent is '.' for a bare
        # file name, so this also works when no directory is given.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Save CSV
        df_final.to_csv(output_path, index=False, encoding='utf-8')

        # Verify that it was saved correctly
        if not os.path.exists(output_path):
            print("Error: The file was not saved correctly")
            return False

        file_size = os.path.getsize(output_path)
        print("File saved successfully")
        print(f"Path: {output_path}")
        print(f"Records: {len(df_final)}")
        print(f"Columns: {len(df_final.columns)}")
        print(f"Size: {file_size / 1024:.2f} KB")
        return True
    except Exception as e:
        print(f"Error saving file: {e}")
        return False


def _print_preview(df_final):
    """Prints the first column names and a small sample (best effort)."""
    try:
        # Show column information
        print("Columns included:")
        for i, col in enumerate(df_final.columns[:10]):  # Show only the first 10
            print(f" {i+1:2d}. {col}")

        if len(df_final.columns) > 10:
            print(f" ... and {len(df_final.columns) - 10} more columns")

        # Show sample data (only some key columns)
        sample_columns = [
            col for col in ('basename_final', 'Old_Name', 'New_Name')
            if col in df_final.columns
        ]
        if sample_columns:
            print(f"\n 📊 Sample data (first 5 rows):")
            print(df_final[sample_columns].head().to_string(index=False))
    except Exception as e:
        # The preview is informational only; it must not fail the whole run.
        print(f" ⚠️ Error showing preview: {e}")


def join_csv_json_metadata(csv_path=DEFAULT_CSV_PATH,
                           json_path=DEFAULT_JSON_PATH,
                           output_path=DEFAULT_OUTPUT_PATH):
    """
    Main function that joins the metadata from the CSV and JSON.

    The CSV column 'Old_Name' (extension removed) is matched against the
    JSON field 'basename' with an OUTER merge, so records present in only
    one of the files are kept. The result is written to `output_path`.

    Args:
        csv_path (str): Path to the CSV file (must contain 'Old_Name').
        json_path (str): Path to the JSON file (must contain 'basename').
        output_path (str): Path of the CSV file to write.

    Returns:
        bool: True if the process was successful, False otherwise
    """
    print("Starting JOINING CSV + JSON METADATA")
    print("=" * 60)

    print("Files to process:")
    print(f" • CSV: {csv_path}")
    print(f" • JSON: {json_path}")
    print(f" • Output: {output_path}")

    # =========================================================================
    # 1. VERIFY FILES EXISTENCE
    # =========================================================================
    print(f"\n Verifying files")

    if not os.path.exists(csv_path):
        print(f"Error: No CSV file found at {csv_path}")
        return False

    if not os.path.exists(json_path):
        print(f"Error: No JSON file found at {json_path}")
        return False

    print("Both files found")

    # =========================================================================
    # 2. LOAD CSV FILE
    # =========================================================================
    print(f"\n Loading CSV file")

    df_csv = load_csv_data(csv_path)
    if df_csv is None:
        return False

    print(f"Registers in CSV: {len(df_csv)}")
    print(f"Columns in CSV: {list(df_csv.columns)}")

    # Verify that the required column exists
    if 'Old_Name' not in df_csv.columns:
        print("Error: Doesn't 'Old_Name' exist in the CSV")
        print(f" Available columns: {list(df_csv.columns)}")
        return False

    # =========================================================================
    # 3. LOAD JSON FILE
    # =========================================================================
    print(f"\n Loading JSON file...")

    df_json = load_json_data(json_path)
    if df_json is None:
        return False

    print(f"Registers in JSON: {len(df_json)}")
    print(f"Columns in JSON: {len(df_json.columns)}")

    # Verify that the 'basename' field exists
    if 'basename' not in df_json.columns:
        print("Error: Wasnt found the 'basename' field in the JSON")
        print(f" Available columns: {list(df_json.columns)}")
        return False

    # =========================================================================
    # 4. PREPARE DATA FOR MERGE
    # =========================================================================
    print(f"\n Preparing data for merge...")

    # Extract basename from CSV (remove the extension from Old_Name)
    df_csv['basename_csv'] = df_csv['Old_Name'].apply(extract_basename_from_csv_name)

    # The JSON already has the 'basename' field; a copy under a distinct name
    # lets both merge keys survive in the merged frame.
    df_json['basename_json'] = df_json['basename']

    print(f"Unique basenames in CSV: {df_csv['basename_csv'].nunique()}")
    print(f"Unique basenames in JSON: {df_json['basename_json'].nunique()}")

    print(f"\n EXAMPLES OF BASENAMES:")
    _print_basename_examples("CSV", df_csv['basename_csv'])
    _print_basename_examples("JSON", df_json['basename_json'])

    # =========================================================================
    # 5. DO THE MERGE
    # =========================================================================
    print(f"\n Performing data merge")

    df_merged = pd.merge(
        df_csv,
        df_json,
        left_on='basename_csv',
        right_on='basename_json',
        how='outer',  # Keep ALL records from both files
        suffixes=('_csv', '_json'),
    )

    print("Join completed")
    print(f"Total records after merge: {len(df_merged)}")

    # =========================================================================
    # 6. ANALYZE MERGE RESULTS
    # =========================================================================
    print(f"\n ANALYZE RESULTS:")
    _report_merge_statistics(df_csv, df_json, df_merged)

    # =========================================================================
    # 7. CLEAN AND ORGANIZE FINAL DATA
    # =========================================================================
    print(f"\n Cleaning and organizing final data...")

    df_final = _organize_final_columns(df_merged)

    print("Organized data")
    print(f"Columns in the final file: {len(df_final.columns)}")

    # =========================================================================
    # 8. SAVE THE RESULTING FILE
    # =========================================================================
    print(f"\n💾 Saving resulting file...")

    if not _save_result(df_final, output_path):
        return False

    # =========================================================================
    # 9. SHOW PREVIEW OF THE RESULT
    # =========================================================================
    print(f"\n SHOWING PREVIEW OF THE RESULT:")
    _print_preview(df_final)

    return True


# =============================================================================
# MAIN FUNCTION
# =============================================================================

def main():
    """
    Main function of the script.

    Runs the join with the default paths and prints a final status message.
    """
    # PureWindowsPath splits on both '\\' and '/', so the file and folder
    # names are extracted correctly regardless of the OS running the script.
    csv_name = PureWindowsPath(DEFAULT_CSV_PATH).name
    json_name = PureWindowsPath(DEFAULT_JSON_PATH).name
    output = PureWindowsPath(DEFAULT_OUTPUT_PATH)

    print("Join CSV + JSON METADATA")
    print(f"Joining {csv_name} with {json_name}")
    print("Based on: basename (JSON) ↔ Old_Name (CSV)")
    print("=" * 60)

    # Run the join process
    success = join_csv_json_metadata()

    if success:
        print(f"\n PROCESS COMPLETED SUCCESSFULLY!")
        print(f"Check the file '{output.name}' in the {output.parent.name} folder")
        print("The file contains ALL data from both original files")
    else:
        print(f"\n The process failed. Check the errors shown above.")


# =============================================================================
# SCRIPT ENTRY POINT
# =============================================================================

if __name__ == "__main__":
    main()