Phenology/Code/GBIF_download/metadatos.py

163 lines
5.2 KiB
Python

"""
Script to obtain metadata from GBIF using the gbif_dl library.
Important: This script requires an internet connection.
Species: Corylus L. (hazelnut)
taxonKey = 2875967
License: CC BY 4.0
Species: Cynara cardunculus L. (artichoke)
taxonKey = 3112364
License: CC BY 4.0
"""
import gbif_dl
import json
import os
import pandas as pd
def get_gbif_metadata(taxon_key=3112364, license_code="CC_BY_4_0",
                      nb_samples=8000, metadata_dir="metadata"):
    """
    Collect metadata records from GBIF and save them to disk.

    No images are downloaded: every item yielded by
    ``gbif_dl.api.generate_urls`` is stored as-is in
    ``<metadata_dir>/gbif_metadata.json`` and then summarised into a CSV
    by ``create_summary_csv``. Requires an internet connection.

    Args:
        taxon_key: GBIF taxon key, or a list/tuple/set of keys. Defaults to
            3112364, the key this script was originally hard-coded to.
        license_code: Licence value sent in the query (default "CC_BY_4_0").
        nb_samples: Value forwarded as ``nb_samples`` to gbif_dl.
        metadata_dir: Directory for the output files; created if missing.

    Returns:
        bool: True if at least one record was collected and saved; False if
        the query produced nothing or any error occurred (the error is
        printed, never raised).
    """
    try:
        print("Configuring GBIF query")
        # The query expects a list of keys; accept a single key for convenience.
        if isinstance(taxon_key, (list, tuple, set)):
            taxon_keys = list(taxon_key)
        else:
            taxon_keys = [taxon_key]
        query = {
            "taxonKey": taxon_keys,
            "license": [license_code],
        }

        print("Generating URLs and obtaining metadata...")
        data_gen = gbif_dl.api.generate_urls(
            queries=query,
            label="taxonKey",
            nb_samples=nb_samples,
        )

        # Create the output directory before the (potentially long) download
        # so that an unwritable location fails early.
        os.makedirs(metadata_dir, exist_ok=True)
        print(f"📁 Directory '{metadata_dir}' created/verified")

        metadata_list = []
        print("Collecting metadata")
        # Only the metadata is kept; nothing is downloaded per item.
        for position, item in enumerate(data_gen, 1):
            metadata_list.append(item)
            if position % 100 == 0:
                print(f"Processed {position} metadata...")

        count = len(metadata_list)
        if count == 0:
            # Do not write empty files or claim completion when nothing came back.
            print("No metadata found for the query; nothing was saved.")
            return False

        print(f"\n💾 Saving {count} metadata...")
        metadata_file = os.path.join(metadata_dir, "gbif_metadata.json")
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata_list, f, indent=2, ensure_ascii=False)

        # create_summary_csv reports its own errors and does not raise.
        create_summary_csv(metadata_list, metadata_dir)
        csv_file = os.path.join(metadata_dir, 'gbif_summary.csv')

        print("Full process completed:")
        print(f" • Metadata collected: {count}")
        print(f" • Full JSON file: {metadata_file}")
        if os.path.isfile(csv_file):
            print(f" • Summary CSV file: {csv_file}")
        else:
            print(" • Summary CSV file: not created")
        return True
    except Exception as e:
        # Top-level boundary of the collection step: report and signal failure.
        print(f"Error: {e}")
        return False
def create_summary_csv(metadata_list, output_dir):
    """
    Write ``gbif_summary.csv`` in ``output_dir`` with selected metadata fields.

    Each item of ``metadata_list`` must support ``.get(key, default)``.
    Keys missing from an item are written as empty strings. The header row
    is always written, even when ``metadata_list`` is empty, so the file
    can be read back with the expected columns.

    NOTE(review): confirm that the items produced by gbif_dl actually carry
    these keys (gbifID, species, country, ...); if they do not, most columns
    will be empty.

    Args:
        metadata_list: List of dict-like metadata records.
        output_dir: Existing directory where the CSV is written.

    Returns:
        None. Errors are printed, not raised.
    """
    # (CSV column, metadata key) pairs, in output column order.
    fields = (
        ('gbif_id', 'gbifID'),
        ('species', 'species'),
        ('genus', 'genus'),
        ('family', 'family'),
        ('country', 'country'),
        ('locality', 'locality'),
        ('latitude', 'decimalLatitude'),
        ('longitude', 'decimalLongitude'),
        ('date', 'eventDate'),
        ('collector', 'recordedBy'),
        ('institution', 'institutionCode'),
        ('catalog_number', 'catalogNumber'),
        ('license', 'license'),
        ('image_url', 'identifier'),
        ('basis_of_record', 'basisOfRecord'),
    )
    try:
        summary_data = [
            {column: item.get(key, '') for column, key in fields}
            for item in metadata_list
        ]
        # Explicit columns keep the header present for an empty input.
        columns = [column for column, _ in fields]
        df = pd.DataFrame(summary_data, columns=columns)
        csv_file = os.path.join(output_dir, 'gbif_summary.csv')
        df.to_csv(csv_file, index=False, encoding='utf-8')
        print(f"CSV created with {len(summary_data)} records")
    except Exception as e:
        # Best-effort step: report the problem and let the caller continue.
        print(f"Error creating CSV: {e}")
def show_metadata_preview(metadata_dir):
    """
    Print a short preview of ``gbif_summary.csv`` located in ``metadata_dir``.

    Prints the total number of records, the number of distinct values in
    the ``country`` and ``institution`` columns, and the first three rows.
    If the file does not exist a warning is printed instead of silently
    doing nothing.

    Args:
        metadata_dir: Directory expected to contain ``gbif_summary.csv``.

    Returns:
        None. Errors are printed, not raised.
    """
    try:
        csv_file = os.path.join(metadata_dir, 'gbif_summary.csv')
        if not os.path.exists(csv_file):
            print(f" ⚠️ Summary file not found: {csv_file}")
            return
        df = pd.read_csv(csv_file)
        print("\n METADATA PREVIEW:")
        print(f" • Total records: {len(df)}")
        print(f" • Unique countries: {df['country'].nunique()}")
        print(f" • Unique institutions: {df['institution'].nunique()}")
        print("\n🔝 Top 3 records:")
        print(df.head(3).to_string(index=False))
    except Exception as e:
        # The preview is informational only; never let it abort the script.
        print(f" ⚠️ Error showing preview: {e}")
def main():
    """
    Script entry point.

    Runs the metadata collection and, when it reports success, prints where
    the files are and shows a preview of the summary CSV in the 'metadata'
    folder. On failure only a failure message is printed.
    """
    print("Start recolecting metadata GBIF")
    print("=" * 50)

    # Guard clause: nothing to preview if the collection failed.
    if not get_gbif_metadata():
        print("\n The process failed.")
        return

    print("\n Metadata collected successfully!")
    print("Check the 'metadata' folder to see the generated files")
    show_metadata_preview("metadata")


if __name__ == "__main__":
    main()