"""
Script to join CSV + JSON metadata.
====================================

This script merges metadata from:
- CSV: change_name.csv (column Old_Name)
- JSON: gbif_metadata.json (field basename)

FUNCTIONALITY:
- Merges both files based on the file name (basename)
- Combines ALL data from both files
- Saves the result in a unified CSV

Author: Sofia Garcia Arcila
Date: October 2025
Version: 1.0
"""

import pandas as pd       # Data manipulation
import json               # JSON file handling
import os                 # System operations
from pathlib import Path  # Modern path handling

# =============================================================================
# AUXILIARY FUNCTIONS
# =============================================================================

def extract_basename_from_csv_name(Old_Name):
    """
    Extract the basename (file name without extension) from a CSV name cell.

    Args:
        Old_Name (str): Name of the file from the CSV (e.g., "imagen.jpg").
            Missing cells (NaN) and empty strings are tolerated.

    Returns:
        str: File name without its extension (e.g., "imagen"), or '' when
        the input is missing or empty.
    """
    # Guard against missing cells (NaN) and empty strings coming from pandas.
    if pd.isna(Old_Name) or Old_Name == '':
        return ''

    # str() makes this robust to non-string cells (e.g., purely numeric file
    # names that pandas parsed as int/float); the original crashed on those.
    basename = os.path.splitext(str(Old_Name))[0]
    return basename
def load_csv_data(csv_path):
    """
    Load a CSV file, falling back from UTF-8 to Latin-1 encoding.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: DataFrame with the CSV data, or None if loading failed.
    """
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
        print("CSV loaded with UTF-8 encoding")
        return df
    except UnicodeDecodeError:
        # Not valid UTF-8; retry with Latin-1, which accepts any byte value.
        try:
            df = pd.read_csv(csv_path, encoding='latin-1')
            print("CSV loaded with Latin-1 encoding")
            return df
        except Exception as e:
            print(f"Error loading CSV: {e}")
            return None
    except Exception as e:
        # Any other failure (missing file, parse error, ...) is reported and
        # signalled with None, matching load_json_data's contract. The
        # original let these propagate uncaught.
        print(f"Error loading CSV: {e}")
        return None
def load_json_data(json_path):
    """
    Load a JSON file and convert it to a DataFrame.

    Args:
        json_path (str): Path to the JSON file. Expected to contain a list of
            records (or any structure accepted by pd.DataFrame).

    Returns:
        pd.DataFrame: DataFrame with the JSON data, or None if loading failed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Convert the parsed structure into a tabular DataFrame.
        df = pd.DataFrame(json_data)
        print("JSON loaded successfully")
        return df
    except Exception as e:
        # Covers missing file, malformed JSON, and DataFrame conversion errors.
        print(f"Error loading JSON: {e}")
        return None
def join_csv_json_metadata(csv_path=None, json_path=None, output_path=None):
    """
    Join the metadata from the CSV and the JSON into a single output CSV.

    The two sources are matched on the image basename: `Old_Name` in the CSV
    (extension stripped) against the `basename` field in the JSON. An OUTER
    merge keeps every record from both files.

    Args:
        csv_path (str, optional): Path to the rename CSV. Defaults to the
            original hard-coded location.
        json_path (str, optional): Path to the GBIF metadata JSON. Defaults
            to the original hard-coded location.
        output_path (str, optional): Path of the merged CSV to write.
            Defaults to the original hard-coded location.

    Returns:
        bool: True if the process was successful, False otherwise.
    """
    print("Starting JOINING CSV + JSON METADATA")
    print("=" * 60)

    # =========================================================================
    # 1. Define PATHS (defaults preserve the original behaviour)
    # =========================================================================
    if csv_path is None:
        csv_path = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\change_namesAV.csv"
    if json_path is None:
        json_path = r"C:\Users\sof12\Desktop\ML\metadata\gbif_metadata.json"
    if output_path is None:
        output_path = r"C:\Users\sof12\Desktop\ML\dataset\Carciofo\joined_metadata.csv"

    print("Files to process:")
    print(f" • CSV: {csv_path}")
    print(f" • JSON: {json_path}")
    print(f" • Output: {output_path}")

    # =========================================================================
    # 2. Verify FILES EXISTENCE
    # =========================================================================
    print("\n Verifying files")

    if not os.path.exists(csv_path):
        print(f"Error: No CSV file found at {csv_path}")
        return False

    if not os.path.exists(json_path):
        print(f"Error: No JSON file found at {json_path}")
        return False

    print("Both files found")

    # =========================================================================
    # 3. LOAD CSV FILE
    # =========================================================================
    print("\n Loading CSV file")

    df_csv = load_csv_data(csv_path)
    if df_csv is None:
        return False

    print(f"Registers in CSV: {len(df_csv)}")
    print(f"Columns in CSV: {list(df_csv.columns)}")

    # The merge key is derived from this column, so it must be present.
    if 'Old_Name' not in df_csv.columns:
        print("Error: 'Old_Name' column does not exist in the CSV")
        print(f" Available columns: {list(df_csv.columns)}")
        return False

    # =========================================================================
    # 4. LOAD JSON FILE
    # =========================================================================
    print("\n Loading JSON file...")

    df_json = load_json_data(json_path)
    if df_json is None:
        return False

    print(f"Registers in JSON: {len(df_json)}")
    print(f"Columns in JSON: {len(df_json.columns)}")

    if 'basename' not in df_json.columns:
        print("Error: 'basename' field was not found in the JSON")
        print(f" Available columns: {list(df_json.columns)}")
        return False

    # =========================================================================
    # 5. Prepare DATA FOR MERGE
    # =========================================================================
    print("\n Preparing data for merge...")

    # Extract basename from CSV (strip the extension from Old_Name).
    df_csv['basename_csv'] = df_csv['Old_Name'].apply(extract_basename_from_csv_name)

    # The JSON already has 'basename'; duplicate it under a distinct name so
    # that, after the merge, we can tell which side each row came from.
    df_json['basename_json'] = df_json['basename']

    unique_basenames_csv = df_csv['basename_csv'].nunique()
    unique_basenames_json = df_json['basename_json'].nunique()

    print(f"Unique basenames in CSV: {unique_basenames_csv}")
    print(f"Unique basenames in JSON: {unique_basenames_json}")

    # Show a few example keys from each side to ease debugging of mismatches.
    print("\n EXAMPLES OF BASENAMES:")
    print(" CSV (first 5):")
    for i, basename in enumerate(df_csv['basename_csv'].head().tolist()):
        print(f" {i+1}. '{basename}'")

    print(" JSON (first 5):")
    for i, basename in enumerate(df_json['basename_json'].head().tolist()):
        print(f" {i+1}. '{basename}'")

    # =========================================================================
    # 6. Do the MERGE
    # =========================================================================
    print("\n Performing data merge")

    # OUTER merge: keep ALL records from both files, matched or not.
    df_merged = pd.merge(
        df_csv,
        df_json,
        left_on='basename_csv',
        right_on='basename_json',
        how='outer',
        suffixes=('_csv', '_json')
    )

    print("Join completed")
    print(f"Total records after merge: {len(df_merged)}")

    # =========================================================================
    # 7. Analyze MERGE RESULTS
    # =========================================================================
    print("\n ANALYZE RESULTS:")

    # A row came from both sides iff both key columns are populated.
    both_found = df_merged['basename_csv'].notna() & df_merged['basename_json'].notna()
    only_csv = df_merged['basename_csv'].notna() & df_merged['basename_json'].isna()
    only_json = df_merged['basename_csv'].isna() & df_merged['basename_json'].notna()

    matches = both_found.sum()
    csv_only_count = only_csv.sum()
    json_only_count = only_json.sum()

    print(f" • Founded coincidences: {matches}")
    print(f" • Just in the CSV: {csv_only_count}")
    print(f" • Just in the JSON: {json_only_count}")

    # Match percentages relative to each source's own record count.
    total_csv = len(df_csv)
    total_json = len(df_json)
    match_pct_csv = (matches / total_csv * 100) if total_csv > 0 else 0
    match_pct_json = (matches / total_json * 100) if total_json > 0 else 0

    print(f" • Percentage of CSV with match: {match_pct_csv:.1f}%")
    print(f" • Percentage of JSON with match: {match_pct_json:.1f}%")

    # =========================================================================
    # 8. CLEAN AND ORGANIZE FINAL DATA
    # =========================================================================
    print("\n Cleaning and organizing final data...")

    # Unified key: prefer the CSV basename, fall back to the JSON one.
    df_merged['basename_final'] = df_merged['basename_csv'].fillna(df_merged['basename_json'])

    # Put the most important columns first (only those actually present).
    important_columns = ['basename_final', 'Old_Name', 'New_Name']
    important_columns = [col for col in important_columns if col in df_merged.columns]

    # Everything else follows, except the internal merge-key columns.
    other_columns = [col for col in df_merged.columns
                     if col not in important_columns + ['basename_csv', 'basename_json']]

    df_final = df_merged[important_columns + other_columns]

    print("Organized data")
    print(f"Columns in the final file: {len(df_final.columns)}")

    # =========================================================================
    # 9. SAVE THE RESULTING FILE
    # =========================================================================
    print("\n💾 Saving resulting file...")

    try:
        # Create the destination directory if needed. Guarding the empty
        # dirname avoids os.makedirs('') raising when output_path is a bare
        # file name (a bug in the original).
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df_final.to_csv(output_path, index=False, encoding='utf-8')

        # Verify that the file actually landed on disk.
        if os.path.exists(output_path):
            file_size = os.path.getsize(output_path)
            print("File saved successfully")
            print(f"Path: {output_path}")
            print(f"Records: {len(df_final)}")
            print(f"Columns: {len(df_final.columns)}")
            print(f"Size: {file_size / 1024:.2f} KB")
        else:
            print("Error: The file was not saved correctly")
            return False

    except Exception as e:
        print(f"Error saving file: {e}")
        return False

    # =========================================================================
    # 10. SHOW PREVIEW OF THE RESULT
    # =========================================================================
    print("\n SHOWING PREVIEW OF THE RESULT:")

    try:
        print("Columns included:")
        for i, col in enumerate(df_final.columns[:10]):  # show only the first 10
            print(f" {i+1:2d}. {col}")

        if len(df_final.columns) > 10:
            print(f" ... and {len(df_final.columns) - 10} more columns")

        # Show sample data for a few key columns only.
        sample_columns = [col for col in ['basename_final', 'Old_Name', 'New_Name']
                          if col in df_final.columns]

        if sample_columns:
            print("\n 📊 Sample data (first 5 rows):")
            print(df_final[sample_columns].head().to_string(index=False))

    except Exception as e:
        # The preview is best-effort; a failure here must not fail the run.
        print(f" ⚠️ Error showing preview: {e}")

    return True
# =============================================================================
# MAIN FUNCTION
# =============================================================================

def main():
    """
    Entry point: run the CSV + JSON metadata join and report the outcome.
    """
    print("Join CSV + JSON METADATA")
    # The original header named 'cambios_nombres.csv'; the script actually
    # reads change_namesAV.csv (see join_csv_json_metadata's default paths).
    print("Joining change_namesAV.csv with gbif_metadata.json")
    print("Based on: basename (JSON) ↔ Old_Name (CSV)")
    print("=" * 60)

    # Run the join process.
    success = join_csv_json_metadata()

    if success:
        print("\n PROCESS COMPLETED SUCCESSFULLY!")
        # Fixed: the original pointed users at 'metadatos_unidos.csv' in the
        # 'Nocciola' folder, but the script writes joined_metadata.csv to the
        # Carciofo folder.
        print("Check the file 'joined_metadata.csv' in the Carciofo folder")
        print("The file contains ALL data from both original files")
    else:
        print("\n The process failed. Check the errors shown above.")
# =============================================================================
# SCRIPT ENTRY POINT
# =============================================================================

# Run the join only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()