Commit 16daf484 authored by Michael Wagner's avatar Michael Wagner

add a delay for the request rate limit; issue further requests for bigger entries, i.e. grab the next url

parent 4882d0fc
##################################################
## {Accesses Zenodo.org in order to supply CRIS with the location of research data and/or papers, etc. on Zenodo. More than 2000 requests -> use the slower rate of 33 requests/minute}
##################################################
## {License_info}
##################################################
## Author: {Michael Wagner}
## Credits: [{credit_list}]
## License: {license}
## Version: {0}.{1}.{0}
## Maintainer: {cris.support@fau.de}
## Email: {michael.wm.wagner@fau.de}
## Status: {in development}
##################################################
# ToDo: does the zenodo api return ALL results? no! only 10 results per request
# ToDo: grab by FAU tag (organization or so it was called)
# a resumption token is only valid for 2 min -> 422 Unprocessable Entity error
# rate limit -> 429 too many requests (60 requests per minute)
# 60 requests per minute, 2000 requests per hour
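# Added note (not from the original script): the two limits above imply a minimum safe delay of
#   max(60 s / 60 requests, 3600 s / 2000 requests) = max(1.0, 1.8) = 1.8 seconds between requests,
# so the slow_down of roughly 2 seconds used further below stays within both limits.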
import requests
import json
import os
import time
# import pandas as pd
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# HELPER
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# too many requests in a short time and the api will throw a 429: Too Many Requests error
def error_handling(status_code, current_slow_down):
    """
    error codes from https://developers.zenodo.org/?python#http-status-codes
    :param status_code: HTTP status code of the response, e.g. 404
    :param current_slow_down: current delay in seconds between two requests
    :return: prints the error message to the console, returns the new slow_down modifier
    """
    error_dict = {204: 'No Content: Request succeeded. No response included. Usually sent for DELETE requests.',
                  400: 'Bad Request: Request failed. Error response included.',
                  401: 'Unauthorized: Request failed, due to an invalid access token. Error response included.',
                  403: 'Forbidden: Request failed, due to missing authorization (e.g. deleting an already submitted upload or missing scopes for your access token). Error response included.',
                  404: 'Not Found: Request failed, due to the resource not being found. Error response included.',
                  405: 'Method Not Allowed: Request failed, due to unsupported HTTP method. Error response included.',
                  409: 'Conflict: Request failed, due to the current state of the resource (e.g. edit a deposition which is not fully integrated). Error response included.',
                  415: 'Unsupported Media Type: Request failed, due to missing or invalid request header Content-Type. Error response included.',
                  429: 'Too Many Requests: Request failed, due to rate limiting. Error response included.',
                  500: 'Internal Server Error: Request failed, due to an internal server error. Error response NOT included. Don’t worry, Zenodo admins have been notified and will be dealing with the problem ASAP.'
                  }
    if status_code == 429:
        # 429 has to be handled before the generic lookup, otherwise the back-off below is never reached
        print(status_code, error_dict[429])
        # wait a minute, then increase the delay between requests
        print("Sleeping a minute, setting slow_down timer to", current_slow_down + 0.5, 'from', current_slow_down)
        print("limits: 60 requests per minute, 2000 requests per hour")
        time.sleep(60)
        return current_slow_down + 0.5
    elif status_code in error_dict:
        print(status_code, error_dict[status_code])
    else:
        print(status_code, 'an error occurred - no error message found in the error_dict')
    return current_slow_down


def save_hits_locally(orcid, hits):
    """
    creates folders with the orcid, fills them with all search hits
    :param orcid: current orcid (identifier of a person)
    :param hits: list of zenodo.org search result hits
    :return: -
    """
    i = 0
    for hit in hits:
        path = './json-results/' + orcid + '/'
        # ... (rest of the loop body is collapsed in this diff)
        i += 1


def request_json(url, slow_down, params=None):
    # wait slow_down seconds before every request to stay below the rate limit
    time.sleep(slow_down)
    response = requests.get(url, params=params)
    data = response.json()
    return data
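

# The following helper is a sketch added for illustration (it is not called anywhere in this
# script): a variant of request_json that reacts to a 429 response by delegating to
# error_handling above and retrying, instead of only sleeping a fixed time up front.
# The name request_json_with_retry and the max_retries parameter are made up for this sketch.
def request_json_with_retry(url, slow_down, params=None, max_retries=1):
    for attempt in range(max_retries + 1):
        time.sleep(slow_down)
        response = requests.get(url, params=params)
        if response.status_code != 429:
            return response.json()
        # rate limited: error_handling sleeps for a minute and returns an increased delay
        slow_down = error_handling(response.status_code, slow_down)
    # still rate limited after the allowed retries -> return whatever the last response was
    return response.json()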
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# CORE ZENODO API ACCESSES
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


def harvest_all():
    # https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_datacite
    url = 'https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_datacite'
    # ToDo: harvest ALL the data -> nice xml, but, yeah... slow and a bit much? eh why not?
    r = requests.get(url)
    print(r.status_code)
    print("WIP")


def harvest_by_community():
    """
    https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_datacite&set=user-fau
    :return:
    """
    # ... (rest of the function is collapsed in this diff)
    print("WIP")


def harvest_by_orcid(orcids):
    """
    https://developers.zenodo.org/?python#changes
    https://zenodo.org/search?page=1&size=20&q=creators.orcid:%220000-0001-7430-3694%22
    https://zenodo.org/oai2d
    :param orcids: array containing the orcids of creators as strings
    :return: a dict containing lists. the key is the orcid, the content are all the results per orcid
    """
    # request limit, so a sleep between requests is needed
    slow_down = 2
    # dict that collects the query results and is returned at the end
    orcid_to_hits_dicts = {}
    for orcid in orcids:
        data = request_json('https://zenodo.org/api/records', slow_down, params={'q': 'creators.orcid:"' + orcid + '"'})
        if 'status' in data:
            # error responses carry a 'status' field -> adjust the slow_down timer (e.g. after a 429)
            slow_down = error_handling(data['status'], slow_down)
        elif 'hits' in data:
            orcid_hits = data['hits']['hits']
            # more than 10 results -> follow the 'next' link and grab the next page
            while 'next' in data.get('links', {}):
                print('requesting more...')
                next_url = data['links']['next']
                data = request_json(next_url, slow_down)
                orcid_hits += data['hits']['hits']
            # saves hits to nested folder structure
            save_hits_locally(orcid, orcid_hits)
            # collect the hits list into the dict, which is the function's return value
            orcid_to_hits_dicts[orcid] = orcid_hits
    return orcid_to_hits_dicts
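

# Usage sketch (illustrative only, not in the original script): the returned dict maps each
# ORCID to the list of Zenodo search hits found for it, e.g.
#   results = harvest_by_orcid(['0000-0001-7430-3694'])
#   for orcid, records in results.items():
#       print(orcid, '->', len(records), 'records')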


if __name__ == '__main__':
    # creators_orcids = ['0000-0001-7430-3694', '0000-0002-8824-6405', '0000-0003-2136-0788', '0000-0002-8273-6059']
    creators_orcids = ['0000-0003-0555-4128']
    # creators_orcids = ['0000-0002-8273-6059']
    hits = harvest_by_orcid(creators_orcids)
    print("wow")
    # harvest_all()