163 lines
5.1 KiB
Python
163 lines
5.1 KiB
Python
"""
Script to obtain the metadata from GBIF using the gbif_dl library.

Important: This script requires an internet connection.

Species: Corylus L. (hazelnut)
taxonKey = 2875967
License: CC BY 4.0

Species: Cynara cardunculus L. (artichoke)
taxonKey = 3112364
License: CC BY 4.0
"""
|
|
|
|
import gbif_dl
|
|
import json
|
|
import os
|
|
import pandas as pd
|
|
|
|
def get_gbif_metadata(*, taxon_keys=(3112364,), licenses=("CC_BY_4_0",),
                      nb_samples=8000, metadata_dir="metadata"):
    """
    Collect GBIF metadata for the given taxa and save it to disk.

    No images are downloaded: the items yielded by
    ``gbif_dl.api.generate_urls`` are only gathered, dumped in full to
    ``<metadata_dir>/gbif_metadata.json`` and summarised in
    ``<metadata_dir>/gbif_summary.csv`` through ``create_summary_csv``.

    Args:
        taxon_keys: Iterable of GBIF taxon keys to query. The default,
            3112364, is Cynara cardunculus L. (artichoke).
        licenses: Iterable of license codes used to filter the query.
        nb_samples: Forwarded unchanged to ``generate_urls`` as
            ``nb_samples``.
        metadata_dir: Directory that receives the output files; it is
            created when missing.

    Returns:
        bool: True when at least one item was collected and the files were
        written, False when nothing was collected or any error occurred
        (the error is printed, not raised).
    """
    # Top-level boundary of the script: any failure (network, disk, JSON
    # serialisation) is reported and turned into a False return value.
    try:
        print("Configuring GBIF query")

        query = {
            "taxonKey": list(taxon_keys),
            "license": list(licenses),
        }

        print("Generating URLs and obtaining metadata...")

        # Generate data URLs
        data_gen = gbif_dl.api.generate_urls(
            queries=query,
            label="taxonKey",
            nb_samples=nb_samples,
        )

        # Create directory to save metadata
        os.makedirs(metadata_dir, exist_ok=True)
        print(f" Directory '{metadata_dir}' created/verified")

        print("Collecting metadata")

        # Iterate over each item ONLY to obtain metadata (NO image download).
        # Errors raised while the generator fetches data surface from the
        # `for` statement itself, so a per-item try/except around `append`
        # could never catch them; they are handled by the outer handler.
        metadata_list = []
        for processed, item in enumerate(data_gen, 1):
            metadata_list.append(item)

            # Show progress every 100 items
            if processed % 100 == 0:
                print(f"Processed {processed} metadata...")

        count = len(metadata_list)

        print(f"\n💾 Saving {count} metadata...")

        # Save complete metadata to JSON
        metadata_file = os.path.join(metadata_dir, "gbif_metadata.json")
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata_list, f, indent=2, ensure_ascii=False)

        # Create summary CSV
        if metadata_list:
            create_summary_csv(metadata_list, metadata_dir)

        print("Full process completed:")
        print(f" • Metadata collected: {count}")
        print(f" • Full JSON file: {metadata_file}")
        # Only advertise the CSV when one was actually requested above.
        if metadata_list:
            print(f" • Summary CSV file: {os.path.join(metadata_dir, 'gbif_summary.csv')}")

        return count > 0

    except Exception as e:
        print(f"Error: {e}")
        return False
|
|
|
|
# Summary CSV column -> key looked up in each metadata record.
# NOTE(review): the keys follow GBIF occurrence field names; confirm that the
# items produced upstream actually carry them, otherwise columns stay empty.
_SUMMARY_COLUMNS = {
    'gbif_id': 'gbifID',
    'species': 'species',
    'genus': 'genus',
    'family': 'family',
    'country': 'country',
    'locality': 'locality',
    'latitude': 'decimalLatitude',
    'longitude': 'decimalLongitude',
    'date': 'eventDate',
    'collector': 'recordedBy',
    'institution': 'institutionCode',
    'catalog_number': 'catalogNumber',
    'license': 'license',
    'image_url': 'identifier',
    'basis_of_record': 'basisOfRecord',
}


def create_summary_csv(metadata_list, output_dir):
    """
    Write ``gbif_summary.csv`` with the most important metadata fields.

    Each record contributes one row; a key missing from a record yields an
    empty string in that column.

    Args:
        metadata_list: Sequence of mapping-like records (anything exposing
            ``.get(key, default)``).
        output_dir: Existing directory in which ``gbif_summary.csv`` is
            written.

    Returns:
        None. Errors are printed instead of raised, so a failed summary
        never aborts the caller.
    """
    try:
        # Extract key information from each metadata record
        rows = [
            {column: item.get(key, '') for column, key in _SUMMARY_COLUMNS.items()}
            for item in metadata_list
        ]

        # Passing the columns explicitly keeps the header row even when
        # there are no records, so the file can always be read back.
        df = pd.DataFrame(rows, columns=list(_SUMMARY_COLUMNS))
        csv_file = os.path.join(output_dir, 'gbif_summary.csv')
        df.to_csv(csv_file, index=False, encoding='utf-8')

        print(f"CSV created with {len(rows)} records")

    except Exception as e:
        print(f"Error creating CSV: {e}")
|
|
|
|
def show_metadata_preview(metadata_dir):
    """
    Print a short overview of ``gbif_summary.csv`` found in *metadata_dir*.

    The overview lists the number of records, the number of distinct
    values in the ``country`` and ``institution`` columns and the first
    three rows. Nothing is printed when the file does not exist; any
    error while reading or summarising is printed instead of raised.
    """
    try:
        summary_path = os.path.join(metadata_dir, 'gbif_summary.csv')
        if not os.path.exists(summary_path):
            return

        summary = pd.read_csv(summary_path)

        print("\n METADATA PREVIEW:")
        print(f" • Total records: {len(summary)}")
        print(f" • Unique countries: {summary['country'].nunique()}")
        print(f" • Unique institutions: {summary['institution'].nunique()}")

        print("\n Top 3 records:")
        first_rows = summary.head(3)
        print(first_rows.to_string(index=False))

    except Exception as e:
        print(f" Error showing preview: {e}")
|
|
|
|
def main():
    """
    Entry point: collect the GBIF metadata and report the outcome.

    On success a preview of the generated summary in the 'metadata'
    folder is shown; on failure only a failure message is printed.
    """
    print("Start recolecting metadata GBIF")
    print("=" * 50)

    # Executing metadata collection
    if not get_gbif_metadata():
        print("\n The process failed.")
        return

    print("\n Metadata collected successfully!")
    print("Check the 'metadata' folder to see the generated files")

    show_metadata_preview("metadata")


# Run the collection only when executed as a script, not on import.
if __name__ == "__main__":
    main()