Phenology/Code/GBIF_download/GBIF_data.py

97 lines
2.7 KiB
Python

"""
This script downloads GBIF data using the gbif_dl library.
Important: This script requires an internet connection.
Information about the species used in the thesis:
Species: Corylus L. (hazelnut)
taxonKey = 2875967
License: CC BY 4.0
Species: Cynara cardunculus L. (artichoke)
taxonKey = 3112364
License: CC BY 4.0
"""
import gbif_dl
import json
import os
def get_gbif_data(
    taxon_key: int = 3112364,
    nb_samples: int = 8000,
    dataset_dir: str = "dataset_gbif_artichoke",
) -> bool:
    """
    Download GBIF images for one taxon, restricted to the CC BY 4.0 license.

    The defaults reproduce the run for Cynara cardunculus L. (artichoke,
    taxonKey 3112364). Pass a different key and directory to download
    another taxon, e.g. Corylus L. (hazelnut, taxonKey 2875967).

    Args:
        taxon_key: GBIF taxon key placed in the query's ``taxonKey`` filter.
        nb_samples: Value forwarded as ``nb_samples`` to
            ``gbif_dl.api.generate_urls`` (8000 by default; the first test
            iterations used 100).
        dataset_dir: Directory, created if missing, that receives the
            downloaded files and ``metadata.json``.

    Returns:
        True if at least one download call completed without raising.
        False if none did, or if any error outside the per-image loop
        aborted the run (the error is printed, not re-raised).
    """
    try:
        print("Configuring GBIF query")
        query = {
            "taxonKey": [taxon_key],  # Taxon key for the requested taxon
            "license": ["CC_BY_4_0"],  # Filter only by CC BY 4.0 license
        }

        print("Generating download URLs")
        data_gen = gbif_dl.api.generate_urls(
            queries=query,
            label="taxonKey",
            nb_samples=nb_samples,
        )

        # Relative path on purpose: a leading slash would target the
        # filesystem root instead of the working directory.
        os.makedirs(dataset_dir, exist_ok=True)
        print(f"Directory '{dataset_dir}' created or verified")

        metadata_list = []
        download_count = 0
        print("Starting image download")

        for i, item in enumerate(data_gen, 1):
            try:
                print(f"Processing image {i}...")
                # Recorded before the download attempt, so items whose
                # download raises are still listed in the metadata file.
                metadata_list.append(item)
                # One item per call so a failure only skips that image.
                gbif_dl.dl_async.download([item], root=dataset_dir)
            except Exception as e:
                # Best effort: report a truncated message and move on.
                print(f"Error in image {i}: {str(e)[:100]}...")
                continue
            # NOTE(review): counts calls that returned without raising;
            # confirm that gbif_dl raises when a download fails.
            download_count += 1
            print(f"Image {i} downloaded successfully")

        print("Saving metadata...")
        metadata_file = os.path.join(dataset_dir, "metadata.json")
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata_list, f, indent=2, ensure_ascii=False)

        print("Process completed:")
        print(f" Images downloaded: {download_count}")
        print(f" Metadata saved in: {metadata_file}")
        return download_count > 0
    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"Error: {e}")
        return False
def main():
    """
    Run the GBIF download and print a final status message.

    Calls ``get_gbif_data()`` with its default arguments and reports
    success or failure based on the boolean it returns.
    """
    print("STARTING GBIF DATA DOWNLOAD")
    print("=" * 50)

    # Execute download
    success = get_gbif_data()

    if success:
        print("\nProcess finished, please review the results in the dataset folder for the downloaded images")
    else:
        print("\n The process failed, please check the error messages above.")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()