## zenodo_harvester.py
##################################################
## {Accesses Zenodo.org in order to supply CRIS with the location of research data and/or papers, etc. on Zenodo. More than 2000 requests -> use the slower rate of 33 requests/minute}
##################################################
## {https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html}
##################################################
## Author: {Michael Wagner}
## Credits: [{}]
## License: {GNU GPL v2}
## Version: {0}.{1}.{0}
## Maintainer: {cris-support@fau.de}
## Email: {cris-support@fau.de}
## Status: {alpha release}
##################################################

# ToDo: grab by FAU tag (organization, or whatever Zenodo calls it)
# ToDo (maybe): grab all of Zenodo -> search for FAU researchers?
# ToDo: depending on the number of ORCIDs, "orcid_to_hits_dicts" might get too big

# resumption tokens are only valid for 2 min -> afterwards: 422 Unprocessable Entity error

# rate limit exceeded -> 429 Too Many Requests
# limits: 60 requests per minute, 2000 requests per hour

import json
import os
import time

import requests


# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# HELPER
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# sending too many requests makes the API answer with a 429 (Too Many Requests) error

def error_handling(status_code, current_slow_down, wait_seconds=60):
    """
    reports a failed zenodo.org request on the console and backs off when the rate limit was hit
    error codes from https://developers.zenodo.org/?python#http-status-codes
    :param status_code: HTTP status code of the response, like 404 e.g.
    :param current_slow_down: delay in seconds that is currently slept before each request
    :param wait_seconds: pause in seconds after a 429 response, defaults to one minute
    :return: prints the error code to console, returns the new slow_down modifier
             (current_slow_down + 0.5 after a 429, current_slow_down unchanged otherwise)
    """
    # the tab separates the short title from the description
    error_dict = {204: 'No Content:\tRequest succeeded. No response included. Usually sent for DELETE requests.',
                  400: 'Bad Request:\tRequest failed. Error response included.',
                  401: 'Unauthorized:\tRequest failed, due to an invalid access token. Error response included.',
                  403: 'Forbidden:\tRequest failed, due to missing authorization (e.g. deleting an already submitted upload or missing scopes for your access token). Error response included.',
                  404: 'Not Found:\tRequest failed, due to the resource not being found. Error response included. ',
                  405: 'Method Not Allowed:\tRequest failed, due to unsupported HTTP method. Error response included.',
                  409: 'Conflict:\tRequest failed, due to the current state of the resource (e.g. edit a deopsition which is not fully integrated). Error response included. ',
                  415: 'Unsupported Media Type:\tRequest failed, due to missing or invalid request header Content-Type. Error response included. ',
                  429: 'Too Many Requests\tRequest: failed, due to rate limiting. Error response included.',
                  500: 'Internal Server Error:\tRequest failed, due to an internal server error. Error response NOT included. Don’t worry, Zenodo admins have been notified and will be dealing with the problem ASAP.'
                  }

    message = error_dict.get(status_code, 'an error occurred - no error message found in the error_dict')
    print(status_code, message)

    if status_code != 429:
        return current_slow_down

    # 429 is itself a key of error_dict, so it has to be handled separately from the
    # plain lookup above - otherwise the back-off below can never be reached
    # ToDo: try current orcid again
    new_slow_down = current_slow_down + 0.5
    print("Sleeping", wait_seconds, "seconds, setting slow_down timer to", new_slow_down, 'from', current_slow_down)
    print("limits: 60 requests per minute, 2000 requests per hour")
    time.sleep(wait_seconds)
    return new_slow_down


def save_hits_locally(orcid, hits):
    """
    creates a folder named after the orcid below ./json-results/ and fills it with all search hits,
    one json file per hit, numbered in the order of the hits (0.json, 1.json, ...)
    :param orcid: current orcid (identifier of a person), used as folder name
    :param hits: list of zenodo.org search result hits, each one must be json-serializable
    :return: -
    """
    target_dir = os.path.join('./json-results', orcid)
    for index, hit in enumerate(hits):
        # kept inside the loop on purpose: an orcid without hits gets no folder at all;
        # exist_ok makes the call idempotent, so no separate os.path.exists check is needed
        os.makedirs(target_dir, exist_ok=True)

        with open(os.path.join(target_dir, str(index) + '.json'), 'w', encoding='utf-8') as f:
            json.dump(hit, f)


def request_json(url, slow_down, params=None, timeout=60):
    """
    retrieves the zenodo response for normal requests and requests needed for people with a lot of entries
    :param url: url to request
    :param slow_down: sleep parameter for request limits, seconds slept before the request is sent
    :param params: request parameters like e.g. the orcid
    :param timeout: seconds to wait for the server before requests raises a timeout error
    :return: dict containing the response; if the body is not valid JSON, a dict holding only
             the HTTP status code under the key 'status'
    """
    time.sleep(slow_down)
    # without a timeout a stalled connection would block the harvest forever
    response = requests.get(url, params=params, timeout=timeout)
    try:
        return response.json()
    except ValueError:
        # body is not JSON: report the bare status code under 'status', the key
        # harvest_by_orcid inspects to detect a failed request
        return {'status': response.status_code}


# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# CORE ZENODO API ACCESSES
# -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

def harvest_all():
    """
    placeholder for harvesting ALL records of zenodo.org, not implemented yet
    planned request: https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_datacite
    :return: -, only prints "WIP" to console
    """
    # ToDo: harvest ALL the data -> nice xml, but, yeah... slow and a bit much? eh why not?
    print("WIP")
def harvest_by_community():
    """
    placeholder for harvesting all records of one zenodo.org community, not implemented yet
    planned request: https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_datacite&set=user-fau
    :return: -, only prints "WIP" to console
    """
    print("WIP")

def harvest_by_orcid(orcids, save_locally=True, slow_down=2):
    """
    uses the zenodo API to request all entries for the users given by their ORCID
    example use in __main__
    :param orcids: array containing the orcids of creators as strings
    :param save_locally: if True, the hits of every orcid are also written to disk via save_hits_locally
    :param slow_down: seconds to wait before each request; zenodo.org has a request limit, atm the default is set, so it does not break the per hour constraint
    :return: a dict containing lists. the key is the orcid, the content are all the results per orcid;
             an orcid whose first request failed has no entry in the dict
    """
    records_url = 'https://zenodo.org/api/records'

    # return dict with results of queries
    orcid_to_hits_dicts = {}

    for orcid in orcids:
        data = request_json(records_url, slow_down, params={'q': 'creators.orcid:"' + orcid + '"'})
        if 'status' in data:
            # something went wrong, proceed to error handling (which may raise slow_down)
            # ToDo: try current orcid again
            slow_down = error_handling(data['status'], slow_down)
            continue
        if 'hits' not in data:
            continue

        # entry successfully found, harvest data, check if next entry page exists
        orcid_hits = list(data['hits']['hits'])
        next_url = data.get('links', {}).get('next')
        while next_url:
            print(orcid, 'requesting more...')
            data = request_json(next_url, slow_down)
            if 'status' in data:
                # a failed page has neither 'hits' nor 'links': handle the error and
                # keep what was collected instead of crashing with a KeyError
                slow_down = error_handling(data['status'], slow_down)
                print(orcid, 'paging stopped early, keeping', len(orcid_hits), 'hits collected so far')
                break
            if 'hits' not in data:
                print(orcid, 'paging stopped early, keeping', len(orcid_hits), 'hits collected so far')
                break
            orcid_hits += data['hits']['hits']
            next_url = data.get('links', {}).get('next')

        # saves hits to nested folder structure
        if save_locally:
            save_hits_locally(orcid, orcid_hits)

        # collect hits list into a dict, which is the functions return value
        orcid_to_hits_dicts[orcid] = orcid_hits

    return orcid_to_hits_dicts


if __name__ == '__main__':
    # example run: harvest the Zenodo records of a few creators identified by their ORCID
    creators_orcids = ['0000-0001-7430-3694', '0000-0003-0555-4128', '0000-0002-8273-6059']
    #creators_orcids = ['0000-0003-0555-4128']
    # with the default arguments the hits are also written below ./json-results/
    hits = harvest_by_orcid(creators_orcids)
    print("Done")