Coverage for portality/tasks/harvester_helpers/epmc/epmc_harvester.py: 87%

90 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from portality.models.harvester import HarvesterPlugin 

2from portality.tasks.harvester_helpers.epmc import client, queries 

3from portality.lib import dates 

4from portality.api.current.client import models as doaj 

5from portality.core import app 

6from datetime import datetime 

7import time 

8 

9 

class DefaultLogger:
    """Minimal in-memory logger.

    EPMCHarvester falls back to an instance of this class when it is
    constructed without a logger. Each logged message is stored in
    ``self._log`` as a dict with a ``"timestamp"`` and a ``"message"`` key.
    """

    def __init__(self):
        # log entries, kept in the order they were recorded
        self._log = []

    def log(self, msg):
        """Record ``msg`` with the value of ``dates.now_with_microseconds()``."""
        entry = {
            "timestamp": dates.now_with_microseconds(),
            "message": msg,
        }
        self._log.append(entry)

19 

20 

class EPMCHarvester(HarvesterPlugin):
    """Harvester plugin that queries EuropePMC for a journal's open-access
    articles and converts each returned record into a ``doaj.Article``.
    """

    def __init__(self, logger=None):
        """Store the logger to use (a fresh DefaultLogger when none is given)."""
        if logger is None:
            logger = DefaultLogger()
        self.logger = logger
        super().__init__()

    def _write_to_logger(self, msg):
        """Forward ``msg`` to the configured logger."""
        self.logger.log(msg)

    def get_name(self):
        """Return the identifier of this harvester plugin."""
        return "epmc"

    def iterate(self, issn, since, to=None):
        """Generate ``(article, day)`` tuples for ``issn``, one day range at a time.

        :param issn: the ISSN handed to the EPMC query builder
        :param since: start of the harvest period; passed through ``dates.parse``
        :param to: end of the harvest period; ``dates.now()`` is used when omitted
        :return: generator of ``(article, day)`` where ``article`` is the crosswalked
            record and ``day`` is the first element of the day range being queried
        """
        # an open-ended harvest runs up to the present
        end = dates.now() if to is None else to

        # turn both boundaries into datestamps
        start_stamp = dates.parse(since)
        end_stamp = dates.parse(end)

        # EPMC is queried one day at a time so that results come back in (roughly) updated-date
        # order. Within a single day there is no ordering; the only way to improve on that would
        # be a finer request granularity, which would massively increase the number of requests.
        day_ranges = dates.day_ranges(start_stamp, end_stamp)
        throttle = app.config.get("EPMC_HARVESTER_THROTTLE")

        previous_request = None
        for day, _until in day_ranges:
            # throttle between days (never before the first day, and only if a throttle is configured)
            if previous_request is not None and throttle is not None:
                elapsed = (datetime.utcnow() - previous_request).total_seconds()
                self._write_to_logger("Last day request at {x}, {y}s ago; throttle {z}s".format(x=previous_request, y=elapsed, z=throttle))
                if elapsed < throttle:
                    pause = throttle - elapsed
                    self._write_to_logger("Throttling EPMC requests for {x}s".format(x=pause))
                    time.sleep(pause)

            # Query the OA articles for this issn on this specific day. The end of the range is
            # deliberately not used: the granularity in EPMC means we would double count.
            # date_sort=True is a weak proxy for ordering by updated date (it actually orders by
            # publication date, which may be partially the same as updated date).
            query = queries.oa_issn_updated(issn, day, date_sort=True)
            epmc = client.EuropePMC(self.logger)
            # the throttle is passed on so that paging requests are throttled too
            for record in epmc.complex_search_iterator(query, throttle=throttle):
                yield self.crosswalk(record), day

            previous_request = datetime.utcnow()

    def crosswalk(self, record):
        """Build and return a ``doaj.Article`` populated from an EPMC ``record``."""
        article = doaj.Article()
        article.bibjson = {"title" : "", "identifier" : []}
        bibjson = article.bibjson
        # FIXME: this is a workaround. To use DataObj in this way, article.bibjson has to be created
        # with valid data (there are data requirements on the struct), so it is created above with
        # the required data and the placeholder title is removed here so that it is not populated
        # later when actual data validation happens.
        # The longer-term fix is a review of the DataObj code; for the time being this is good enough.
        bibjson._delete("title")
        bibjson.journal = {}
        journal = bibjson.journal

        # EPMC sometimes puts the same value in both the issn and essn fields (presumably it treats
        # the issn as the essn when there is no print issn). When the two match, only the eissn is kept.
        print_issn, electronic_issn = record.issn, record.essn
        if print_issn == electronic_issn:
            print_issn = None

        for id_type, value in (("pissn", print_issn), ("eissn", electronic_issn)):
            if value is not None:
                article.add_identifier(id_type, value)

        # straight field-by-field copy of the bibliographic metadata
        bibjson.title = record.title
        article.add_identifier("doi", record.doi)
        journal.volume = record.journal_volume
        journal.number = record.journal_issue
        journal.title = record.journal
        journal.language = record.language
        bibjson.year = record.year_of_publication
        bibjson.month = record.month_of_publication
        journal.start_page = record.start_page
        journal.end_page = record.end_page
        article.add_link("fulltext", record.get_first_fulltext_url())
        bibjson.abstract = record.abstract

        for author in record.authors:
            collective = author.get("collectiveName")
            first = author.get("firstName")
            last = author.get("lastName")

            # with no name parts at all, fall back to the record's author string;
            # if that is missing as well there is nothing to add for this author
            if collective is None and first is None and last is None:
                first = record.author_string
                if first is None:
                    continue

            # join whichever parts are present, separated by single spaces
            name = ""
            for part in (collective, first, last):
                if part is None:
                    continue
                if name != "":
                    name += " "
                name += part
            article.add_author(name=name)

        return article