"""Collect occurrence metadata from GBIF using the gbif_dl library.

Important: this script requires an internet connection.

Species: Corylus L. (hazelnut)
    taxonKey = 2875967
    License: CC BY 4.0

Species: Cynara cardunculus L. (artichoke)
    taxonKey = 3112364
    License: CC BY 4.0
"""

import json
import os
from typing import Optional

import gbif_dl
import pandas as pd

# Defaults reproduce the values that were previously hard-coded in
# get_gbif_metadata(); 3112364 is the artichoke taxonKey listed above.
DEFAULT_TAXON_KEY = 3112364
DEFAULT_LICENSES = ("CC_BY_4_0",)
DEFAULT_NB_SAMPLES = 8000
DEFAULT_METADATA_DIR = "metadata"

JSON_FILENAME = "gbif_metadata.json"
CSV_FILENAME = "gbif_summary.csv"

# Summary CSV column -> key looked up in each collected metadata item.
# NOTE(review): these look like GBIF occurrence field names; confirm that the
# items yielded by gbif_dl.api.generate_urls actually carry them, otherwise
# every summary column will just contain empty strings.
SUMMARY_FIELDS = {
    "gbif_id": "gbifID",
    "species": "species",
    "genus": "genus",
    "family": "family",
    "country": "country",
    "locality": "locality",
    "latitude": "decimalLatitude",
    "longitude": "decimalLongitude",
    "date": "eventDate",
    "collector": "recordedBy",
    "institution": "institutionCode",
    "catalog_number": "catalogNumber",
    "license": "license",
    "image_url": "identifier",
    "basis_of_record": "basisOfRecord",
}


def get_gbif_metadata(
    taxon_key: int = DEFAULT_TAXON_KEY,
    licenses=DEFAULT_LICENSES,
    nb_samples: int = DEFAULT_NB_SAMPLES,
    metadata_dir: str = DEFAULT_METADATA_DIR,
) -> bool:
    """Collect GBIF metadata for one taxon and save it as JSON plus a CSV summary.

    Only the items produced by ``gbif_dl.api.generate_urls`` are collected;
    no images are downloaded.

    Args:
        taxon_key: GBIF taxon key to query.
        licenses: Iterable of license codes passed as the ``license`` filter.
        nb_samples: Value forwarded to ``generate_urls`` as ``nb_samples``.
        metadata_dir: Directory (created if missing) that receives
            ``gbif_metadata.json`` and ``gbif_summary.csv``.

    Returns:
        True if at least one item was collected and the JSON file was written,
        False if nothing was collected or any error occurred (the error is
        printed, not raised).
    """
    try:
        print("Configuring GBIF query")
        query = {
            "taxonKey": [taxon_key],
            "license": list(licenses),
        }

        print("Generating URLs and obtaining metadata...")
        data_gen = gbif_dl.api.generate_urls(
            queries=query,
            label="taxonKey",
            nb_samples=nb_samples,
        )

        os.makedirs(metadata_dir, exist_ok=True)
        print(f" Directory '{metadata_dir}' created/verified")

        print("Collecting metadata")
        metadata_list = []
        # Errors raised while iterating propagate to the outer handler below;
        # appending to a list cannot meaningfully fail, so there is no
        # per-item error handling.
        for processed, item in enumerate(data_gen, 1):
            metadata_list.append(item)
            if processed % 100 == 0:
                print(f"Processed {processed} metadata...")
        count = len(metadata_list)

        print(f"\n💾 Saving {count} metadata...")
        metadata_file = os.path.join(metadata_dir, JSON_FILENAME)
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata_list, f, indent=2, ensure_ascii=False)

        # The CSV is only produced when there is data, and its creation may
        # fail on its own; remember whether a file was really written.
        csv_file = None
        if metadata_list:
            csv_file = create_summary_csv(metadata_list, metadata_dir)

        print("Full process completed:")
        print(f" • Metadata collected: {count}")
        print(f" • Full JSON file: {metadata_file}")
        if csv_file is not None:
            # Report the CSV only when it exists, so the summary never points
            # at a file that was not created.
            print(f" • Summary CSV file: {csv_file}")

        return count > 0

    except Exception as e:
        # Top-level boundary of the collection step: report and signal failure.
        print(f"Error: {e}")
        return False


def create_summary_csv(metadata_list, output_dir: str) -> Optional[str]:
    """Write ``gbif_summary.csv`` with the key fields of each metadata item.

    Args:
        metadata_list: Sequence of dict-like items (anything with ``.get``).
            Keys missing from an item become empty strings in the CSV.
        output_dir: Existing directory in which the CSV is written.

    Returns:
        The path of the written CSV, or None if writing failed (the error is
        printed, not raised).
    """
    try:
        summary_data = [
            {
                column: item.get(source_key, "")
                for column, source_key in SUMMARY_FIELDS.items()
            }
            for item in metadata_list
        ]

        # Explicit columns keep the header stable even for an empty list.
        df = pd.DataFrame(summary_data, columns=list(SUMMARY_FIELDS))
        csv_file = os.path.join(output_dir, CSV_FILENAME)
        df.to_csv(csv_file, index=False, encoding="utf-8")

        print(f"CSV created with {len(summary_data)} records")
        return csv_file

    except Exception as e:
        print(f"Error creating CSV: {e}")
        return None


def show_metadata_preview(metadata_dir: str) -> None:
    """Print a short preview of ``gbif_summary.csv`` found in *metadata_dir*.

    Shows the record count, the number of distinct countries and institutions,
    and the first three rows. Errors are printed, never raised.
    """
    try:
        csv_file = os.path.join(metadata_dir, CSV_FILENAME)
        if not os.path.exists(csv_file):
            # Say so explicitly instead of returning without any output.
            print(f" No summary CSV found at '{csv_file}'; nothing to preview")
            return

        df = pd.read_csv(csv_file)
        print("\n METADATA PREVIEW:")
        print(f" • Total records: {len(df)}")
        print(f" • Unique countries: {df['country'].nunique()}")
        print(f" • Unique institutions: {df['institution'].nunique()}")
        print("\n Top 3 records:")
        print(df.head(3).to_string(index=False))

    except Exception as e:
        print(f" Error showing preview: {e}")


def main() -> None:
    """Run the metadata collection with default settings and preview the result."""
    print("Start recolecting metadata GBIF")
    print("=" * 50)

    success = get_gbif_metadata()

    if success:
        print("\n Metadata collected successfully!")
        print("Check the 'metadata' folder to see the generated files")
        show_metadata_preview(DEFAULT_METADATA_DIR)
    else:
        print("\n The process failed.")


if __name__ == "__main__":
    main()