# Phenology/Code/GBIF_download/Join_metadatos.py
"""
SCRIPT TO JOIN CSV + JSON METADATA
====================================
This script merges metadata from:
- CSV: cambios_nombres.csv (column Nombre_Anterior)
- JSON: gbif_metadata.json (field basename)
FUNCTIONALITY:
- Merges both files based on the file name (basename)
- Combines ALL data from both files
- Saves the result in a unified CSV
Author: Sofia Garcia Arcila
Date: October 2025
Version: 1.0
"""
import pandas as pd # Data manipulation
import json # JSON file handling
import os # System operations
from pathlib import Path # Modern path handling
# =============================================================================
# AUXILIARY FUNCTIONS
# =============================================================================
def extract_basename_from_csv_name(Old_Name):
    """Return the CSV file name with its extension stripped.

    Args:
        Old_Name (str): File name taken from the CSV (e.g., "imagen.jpg").

    Returns:
        str: Name without extension (e.g., "imagen"); '' when the cell
        is missing (NaN) or empty.
    """
    # Guard against NaN cells produced by pandas and empty strings.
    if pd.isna(Old_Name) or Old_Name == '':
        return ''
    # splitext splits off only the last extension (e.g. ".jpg").
    root, _extension = os.path.splitext(Old_Name)
    return root
def load_csv_data(csv_path):
    """Load a CSV file, retrying with Latin-1 when UTF-8 decoding fails.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame | None: DataFrame with the CSV data, or None when
        the file cannot be read (the error is printed, never raised).
    """
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
        print("CSV loaded with UTF-8 encoding")
        return df
    except UnicodeDecodeError:
        # Legacy exports are sometimes Latin-1; retry before giving up.
        try:
            df = pd.read_csv(csv_path, encoding='latin-1')
            print("CSV loaded with Latin-1 encoding")
            return df
        except Exception as e:
            print(f"Error loading CSV: {e}")
            return None
    except Exception as e:
        # Bug fix: previously any non-decoding error (missing file, parse
        # error) escaped this function and crashed the caller, despite the
        # documented "None on error" contract.
        print(f"Error loading CSV: {e}")
        return None
def load_json_data(json_path):
    """Load a JSON file and return its contents as a DataFrame.

    Args:
        json_path (str): Path to the JSON file.

    Returns:
        pd.DataFrame | None: DataFrame built from the JSON payload, or
        None when loading fails (the error is printed, never raised).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        frame = pd.DataFrame(payload)
    except Exception as exc:
        print(f"Error loading JSON: {exc}")
        return None
    print("JSON loaded successfully")
    return frame
def join_csv_json_metadata(
        csv_path=r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\change_namesAV.csv",
        json_path=r"C:\Users\sof12\Desktop\ML\metadata\gbif_metadata.json",
        output_path=r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\joined_metadata.csv"):
    """Join the rename CSV with the GBIF JSON metadata on file basename.

    Performs an outer merge so every record from both inputs is kept,
    puts the key columns first and writes the combined table to
    *output_path*.  The paths are now parameters (defaulting to the
    original hard-coded locations) so the function can be reused on
    other datasets.

    Args:
        csv_path (str): CSV file; must contain an 'Old_Name' column.
        json_path (str): JSON file (list of objects) with a 'basename' field.
        output_path (str): Destination CSV for the merged table.

    Returns:
        bool: True if the process was successful, False otherwise
        (errors are printed, never raised to the caller).
    """
    print("Starting JOINING CSV + JSON METADATA")
    print("=" * 60)
    print("Files to process:")
    print(f" • CSV: {csv_path}")
    print(f" • JSON: {json_path}")
    print(f" • Output: {output_path}")
    # =========================================================================
    # 1. Verify that both input files exist
    # =========================================================================
    print("\n Verifying files")
    if not os.path.exists(csv_path):
        print(f"Error: No CSV file found at {csv_path}")
        return False
    if not os.path.exists(json_path):
        print(f"Error: No JSON file found at {json_path}")
        return False
    print("Both files found")
    # =========================================================================
    # 2. Load the CSV file and validate the join column
    # =========================================================================
    print("\n Loading CSV file")
    df_csv = load_csv_data(csv_path)
    if df_csv is None:
        return False
    print(f"Registers in CSV: {len(df_csv)}")
    print(f"Columns in CSV: {list(df_csv.columns)}")
    if 'Old_Name' not in df_csv.columns:
        print("Error: Doesn't 'Old_Name' exist in the CSV")
        print(f" Available columns: {list(df_csv.columns)}")
        return False
    # =========================================================================
    # 3. Load the JSON file and validate the join field
    # =========================================================================
    print("\n Loading JSON file...")
    df_json = load_json_data(json_path)
    if df_json is None:
        return False
    print(f"Registers in JSON: {len(df_json)}")
    print(f"Columns in JSON: {len(df_json.columns)}")
    if 'basename' not in df_json.columns:
        print("Error: Wasnt found the 'basename' field in the JSON")
        print(f" Available columns: {list(df_json.columns)}")
        return False
    # =========================================================================
    # 4. Prepare the join keys
    # =========================================================================
    print("\n Preparing data for merge...")
    # CSV key: Old_Name with its extension stripped; JSON key: 'basename'
    # as-is (duplicated under a clearer name).
    df_csv['basename_csv'] = df_csv['Old_Name'].apply(extract_basename_from_csv_name)
    df_json['basename_json'] = df_json['basename']
    print(f"Unique basenames in CSV: {df_csv['basename_csv'].nunique()}")
    print(f"Unique basenames in JSON: {df_json['basename_json'].nunique()}")
    print("\n EXAMPLES OF BASENAMES:")
    print(" CSV (first 5):")
    for i, basename in enumerate(df_csv['basename_csv'].head().tolist()):
        print(f" {i+1}. '{basename}'")
    print(" JSON (first 5):")
    for i, basename in enumerate(df_json['basename_json'].head().tolist()):
        print(f" {i+1}. '{basename}'")
    # =========================================================================
    # 5. Merge (outer join keeps ALL records from both files)
    # =========================================================================
    print("\n Performing data merge")
    # indicator=True adds a '_merge' column stating where each row came
    # from; this is more reliable than testing the key columns for NaN,
    # which miscounts rows whose key was already missing in the source.
    df_merged = pd.merge(
        df_csv,
        df_json,
        left_on='basename_csv',
        right_on='basename_json',
        how='outer',
        suffixes=('_csv', '_json'),
        indicator=True,
    )
    print("Join completed")
    print(f"Total records after merge: {len(df_merged)}")
    # =========================================================================
    # 6. Merge statistics
    # =========================================================================
    print("\n ANALYZE RESULTS:")
    matches = int((df_merged['_merge'] == 'both').sum())
    only_in_csv = int((df_merged['_merge'] == 'left_only').sum())
    only_in_json = int((df_merged['_merge'] == 'right_only').sum())
    # Drop the bookkeeping column so it does not end up in the output.
    df_merged = df_merged.drop(columns='_merge')
    print(f" • Founded coincidences: {matches}")
    print(f" • Just in the CSV: {only_in_csv}")
    print(f" • Just in the JSON: {only_in_json}")
    total_csv = len(df_csv)
    total_json = len(df_json)
    pct_csv = (matches / total_csv * 100) if total_csv > 0 else 0
    pct_json = (matches / total_json * 100) if total_json > 0 else 0
    print(f" • Percentage of CSV with match: {pct_csv:.1f}%")
    print(f" • Percentage of JSON with match: {pct_json:.1f}%")
    # =========================================================================
    # 7. Clean and organize the final columns
    # =========================================================================
    print("\n Cleaning and organizing final data...")
    # One unified key column: prefer the CSV key, fall back to the JSON key.
    df_merged['basename_final'] = df_merged['basename_csv'].fillna(df_merged['basename_json'])
    # Key columns first, everything else after; the raw key columns are
    # dropped in favor of 'basename_final'.
    important = [col for col in ('basename_final', 'Old_Name', 'New_Name')
                 if col in df_merged.columns]
    others = [col for col in df_merged.columns
              if col not in important + ['basename_csv', 'basename_json']]
    df_final = df_merged[important + others]
    print("Organized data")
    print(f"Columns in the final file: {len(df_final.columns)}")
    # =========================================================================
    # 8. Save the resulting file
    # =========================================================================
    print("\n💾 Saving resulting file...")
    try:
        # Guard: dirname is '' for a bare filename, and makedirs('') raises.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df_final.to_csv(output_path, index=False, encoding='utf-8')
        if os.path.exists(output_path):
            file_size = os.path.getsize(output_path)
            print("File saved successfully")
            print(f"Path: {output_path}")
            print(f"Records: {len(df_final)}")
            print(f"Columns: {len(df_final.columns)}")
            print(f"Size: {file_size / 1024:.2f} KB")
        else:
            print("Error: The file was not saved correctly")
            return False
    except Exception as e:
        print(f"Error saving file: {e}")
        return False
    # =========================================================================
    # 9. Preview of the result (best-effort; failure here is not fatal)
    # =========================================================================
    print("\n SHOWING PREVIEW OF THE RESULT:")
    try:
        print("Columns included:")
        for i, col in enumerate(df_final.columns[:10]):  # first 10 only
            print(f" {i+1:2d}. {col}")
        if len(df_final.columns) > 10:
            print(f" ... and {len(df_final.columns) - 10} more columns")
        sample_cols = [col for col in ('basename_final', 'Old_Name', 'New_Name')
                       if col in df_final.columns]
        if sample_cols:
            print("\n 📊 Sample data (first 5 rows):")
            print(df_final[sample_cols].head().to_string(index=False))
    except Exception as e:
        print(f" ⚠️ Error showing preview: {e}")
    return True
# =============================================================================
# MAIN FUNCTION
# =============================================================================
def main():
    """Entry point: run the CSV + JSON metadata join and report the outcome."""
    print("Join CSV + JSON METADATA")
    # Bug fix: the banner named 'cambios_nombres.csv', but the script reads
    # change_namesAV.csv (see join_csv_json_metadata's default csv_path).
    print("Joining change_namesAV.csv with gbif_metadata.json")
    print("Based on: basename (JSON) ↔ Old_Name (CSV)")
    print("=" * 60)
    # Run the join process.
    success = join_csv_json_metadata()
    if success:
        print("\n PROCESS COMPLETED SUCCESSFULLY!")
        # Bug fix: the old message pointed at 'metadatos_unidos.csv' in the
        # Nocciola folder; the script actually writes 'joined_metadata.csv'
        # in the Carciofo folder.
        print("Check the file 'joined_metadata.csv' in the Carciofo folder")
        print("The file contains ALL data from both original files")
    else:
        print("\n The process failed. Check the errors shown above.")
# =============================================================================
# SCRIPT ENTRY POINT
# =============================================================================
if __name__ == "__main__":
main()