# Phenology/Code/GBIF_download/metadatos.py

"""
Script to obtain metadata from GBIF using the gbif_dl library.
Important: This script requires an internet connection.

Species: Corylus L. (hazelnut)
taxonKey = 2875967
License: CC BY 4.0
Species: Cynara cardunculus L. (artichoke)
taxonKey = 3112364
License: CC BY 4.0
"""

import gbif_dl
import json
import os
import pandas as pd

def get_gbif_metadata(taxon_key=3112364, license_code="CC_BY_4_0",
                      nb_samples=8000, metadata_dir="metadata"):
    """
    Obtains only the metadata from GBIF for the specified species.
    Doesn't download images, only collects information.

    Every item yielded by gbif_dl is stored as-is in
    '<metadata_dir>/gbif_metadata.json'. When at least one item was
    collected, create_summary_csv() is also called to write the summary CSV.

    Args:
        taxon_key: GBIF taxon key sent as the "taxonKey" filter
            (default 3112364, the previously hard-coded value).
        license_code: value sent as the "license" filter.
        nb_samples: forwarded to gbif_dl.api.generate_urls as nb_samples.
        metadata_dir: directory for the output files; created if missing.

    Returns:
        True if at least one item was collected, False otherwise
        (including when an error prevented the collection).
    """
    try:
        print("Configuring GBIF query")

        query = {
            "taxonKey": [taxon_key],  # Taxon key for the requested species
            "license": [license_code],
        }

        print("Generating URLs and obtaining metadata...")

        # Generate data URLs
        data_gen = gbif_dl.api.generate_urls(
            queries=query,
            label="taxonKey",
            nb_samples=nb_samples,
        )

        # Create directory to save metadata
        os.makedirs(metadata_dir, exist_ok=True)
        print(f"📁 Directory '{metadata_dir}' created/verified")

        metadata_list = []

        print("Collecting metadata")

        # Iterate ONLY to obtain metadata (NO image download).
        # Errors raised by the generator surface at the `for` statement and
        # a failed generator cannot be resumed, so the whole loop is guarded
        # and whatever was collected before the failure is kept.
        try:
            for item in data_gen:
                metadata_list.append(item)

                # Show progress every 100 items
                if len(metadata_list) % 100 == 0:
                    print(f"Processed {len(metadata_list)} metadata...")
        except Exception as e:
            if not metadata_list:
                # Nothing to salvage: let the outer handler report the failure
                raise
            print(f"   ⚠️  Error in item {len(metadata_list) + 1}: {str(e)[:100]}...")

        count = len(metadata_list)
        print(f"\n💾 Saving {count} metadata...")

        # Save complete metadata to JSON
        metadata_file = os.path.join(metadata_dir, "gbif_metadata.json")
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata_list, f, indent=2, ensure_ascii=False)

        # Create summary CSV
        summary_file = os.path.join(metadata_dir, "gbif_summary.csv")
        if metadata_list:
            create_summary_csv(metadata_list, metadata_dir)

        print("Full process completed:")
        print(f"   • Metadata collected: {count}")
        print(f"   • Full JSON file: {metadata_file}")
        # Only advertise the CSV when a summary was requested and the file exists
        if metadata_list and os.path.exists(summary_file):
            print(f"   • Summary CSV file: {summary_file}")

        return count > 0

    except Exception as e:
        print(f"Error: {e}")
        return False

def create_summary_csv(metadata_list, output_dir):
    """
    Creates a CSV file with a summary of the most important metadata.

    Writes '<output_dir>/gbif_summary.csv' with one row per item and a fixed
    set of columns. The header row is written even when metadata_list is
    empty, so the file can always be read back with pandas.

    Args:
        metadata_list: list of dict-like items; missing keys become ''.
        output_dir: existing directory where the CSV is written.

    Returns:
        None. Errors are reported on stdout and not re-raised.
    """
    # Summary column -> key looked up in each metadata item
    field_map = {
        'gbif_id': 'gbifID',
        'species': 'species',
        'genus': 'genus',
        'family': 'family',
        'country': 'country',
        'locality': 'locality',
        'latitude': 'decimalLatitude',
        'longitude': 'decimalLongitude',
        'date': 'eventDate',
        'collector': 'recordedBy',
        'institution': 'institutionCode',
        'catalog_number': 'catalogNumber',
        'license': 'license',
        'image_url': 'identifier',
        'basis_of_record': 'basisOfRecord',
    }

    try:
        # Extract key information from each metadata
        summary_data = []
        for item in metadata_list:
            row = {column: item.get(key, '') for column, key in field_map.items()}
            # NOTE(review): items produced by gbif_dl appear to carry the media
            # link under 'url' rather than 'identifier' - confirm against the
            # gbif_dl documentation. Used only when 'identifier' is absent/empty.
            if not row['image_url']:
                row['image_url'] = item.get('url', '')
            summary_data.append(row)

        # Create DataFrame and save CSV; columns are pinned so an empty list
        # still yields a header row instead of an unparseable empty file.
        df = pd.DataFrame(summary_data, columns=list(field_map))
        csv_file = os.path.join(output_dir, 'gbif_summary.csv')
        df.to_csv(csv_file, index=False, encoding='utf-8')

        print(f"CSV created with {len(summary_data)} records")

    except Exception as e:
        print(f"Error creating CSV: {e}")

def show_metadata_preview(metadata_dir):
    """
    Prints a short overview of the summary CSV found in metadata_dir.

    Reads '<metadata_dir>/gbif_summary.csv' and prints the record count, the
    number of distinct values in the 'country' and 'institution' columns and
    the first three rows. Prints nothing if the file does not exist; any
    error is reported on stdout instead of being raised.
    """
    try:
        summary_path = os.path.join(metadata_dir, 'gbif_summary.csv')

        # Nothing to preview when the summary has not been generated
        if not os.path.exists(summary_path):
            return

        table = pd.read_csv(summary_path)

        print("\n METADATA PREVIEW:")
        total = len(table)
        print(f"   • Total records: {total}")
        countries = table['country'].nunique()
        print(f"   • Unique countries: {countries}")
        institutions = table['institution'].nunique()
        print(f"   • Unique institutions: {institutions}")

        print("\n🔝 Top 3 records:")
        first_rows = table.head(3)
        print(first_rows.to_string(index=False))

    except Exception as err:
        print(f"   ⚠️  Error showing preview: {err}")

def main():
    """
    Script entry point: collect the GBIF metadata, then preview the result.

    Prints a banner, runs get_gbif_metadata() and, if it reports success,
    shows a preview of the files in the 'metadata' folder; otherwise prints
    a failure message.
    """
    print("Start recolecting metadata GBIF")
    print("=" * 50)

    # Run the collection; bail out early when it did not succeed
    if not get_gbif_metadata():
        print("\n The process failed.")
        return

    print("\n Metadata collected successfully!")
    print("Check the 'metadata' folder to see the generated files")

    show_metadata_preview("metadata")

# Run the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()