Coverage for portality/tasks/harvester_helpers/epmc/epmc_harvester.py

1from portality.models.harvester import HarvesterPlugin

2from portality.tasks.harvester_helpers.epmc import client, queries

3from portality.lib import dates

4from portality.api.current.client import models as doaj

5from portality.core import app

6from datetime import datetime

7import time

class DefaultLogger:
    """Fallback logger that accumulates log entries in an in-memory list.

    Each entry is a dict with a ``timestamp`` (the value returned by
    ``dates.now_with_microseconds()`` when the message was logged) and the
    ``message`` itself.
    """

    def __init__(self):
        # entries are kept in the order they were logged
        self._log = []

    def log(self, msg):
        """Record ``msg`` together with the current timestamp."""
        entry = {
            "timestamp": dates.now_with_microseconds(),
            "message" : msg,
        }
        self._log.append(entry)

class EPMCHarvester(HarvesterPlugin):
    """Harvester plugin that queries EPMC for articles, one ISSN at a time."""

    def __init__(self, logger=None):
        """Use the supplied ``logger``, or a fresh ``DefaultLogger`` if none is given."""
        if logger is None:
            logger = DefaultLogger()
        self.logger = logger
        super(EPMCHarvester, self).__init__()

    def _write_to_logger(self, msg):
        """Forward ``msg`` to this harvester's logger."""
        self.logger.log(msg)

    def get_name(self):
        """Return the identifier of this plugin."""
        return "epmc"

    def iterate(self, issn, since, to=None):
        """Yield ``(article, day)`` pairs for ``issn``, one queried day at a time.

        :param issn: the ISSN to build the EPMC query for
        :param since: start of the period; passed through ``dates.parse``
        :param to: end of the period; defaults to ``dates.now()`` when None
        :return: generator of tuples of the crosswalked article and the start
            of the day range the record was found in
        """
        # fall back to "now" as the upper bound if the caller gave none
        upper = dates.now() if to is None else to

        # get both bounds into datestamps
        start_stamp = dates.parse(since)
        end_stamp = dates.parse(upper)

        # We query EPMC one day at a time, so that we can effectively iterate
        # through in updated date order.  Within each day there is no ordering;
        # there is little we can do about that except reduce the request
        # granularity further, which would massively increase the number of
        # requests.
        day_spans = dates.day_ranges(start_stamp, end_stamp)
        throttle = app.config.get("EPMC_HARVESTER_THROTTLE")

        previous = None
        for day, _until in day_spans:
            # throttle each day (never before the very first request)
            if previous is not None and throttle is not None:
                elapsed = (datetime.utcnow() - previous).total_seconds()
                self._write_to_logger(
                    "Last day request at {x}, {y}s ago; throttle {z}s".format(
                        x=previous, y=elapsed, z=throttle))
                if elapsed < throttle:
                    pause = throttle - elapsed
                    self._write_to_logger(
                        "Throttling EPMC requests for {x}s".format(x=pause))
                    time.sleep(pause)

            # Build the query for the oa articles in that issn for the
            # specified day.  We don't use the range, as the granularity in
            # EPMC means we'd double count.
            # date_sort=True is a weak proxy for ordering by updated date (it
            # actually orders by publication date, which may be partially the
            # same as updated date).
            day_query = queries.oa_issn_updated(issn, day, date_sort=True)
            searcher = client.EuropePMC(self.logger)

            # the paging requests are throttled too
            for hit in searcher.complex_search_iterator(day_query, throttle=throttle):
                yield self.crosswalk(hit), day

            previous = datetime.utcnow()

    def crosswalk(self, record):
        """Convert an EPMC ``record`` into a ``doaj.Article`` and return it."""
        article = doaj.Article()
        article.bibjson = {"title" : "", "identifier" : []}
        bibjson = article.bibjson
        # FIXME: this is a hack; to use DataObj in this way, we need to create
        # article.bibjson with valid data (as we have set data requirements on
        # the struct).  So above we create it with the required data, and now
        # we remove the field so that it isn't populated later when we do
        # actual data validation.
        # The longer term fix for this is a review of the DataObj code, but
        # for the time being this is good.
        bibjson._delete("title")
        bibjson.journal = {}
        journal = bibjson.journal

        # Sort out the issns - EPMC sometimes puts the same value in the issn
        # and essn fields, presumably because it regards the issn as the essn
        # when there is no print issn.  If the two are the same, drop the pissn.
        pissn = record.issn
        eissn = record.essn
        pissn = None if pissn == eissn else pissn

        for id_type, id_value in (("pissn", pissn), ("eissn", eissn)):
            if id_value is not None:
                article.add_identifier(id_type, id_value)

        bibjson.title = record.title
        article.add_identifier("doi", record.doi)
        journal.volume = record.journal_volume
        journal.number = record.journal_issue
        journal.title = record.journal
        journal.language = record.language
        bibjson.year = record.year_of_publication
        bibjson.month = record.month_of_publication
        journal.start_page = record.start_page
        journal.end_page = record.end_page
        article.add_link("fulltext", record.get_first_fulltext_url())
        bibjson.abstract = record.abstract

        for author in record.authors:
            collective = author.get("collectiveName")
            first = author.get("firstName")
            last = author.get("lastName")

            # no name parts at all: fall back to the record's author string,
            # and skip the author entirely if that is missing too
            if collective is None and first is None and last is None:
                first = record.author_string
                if first is None:
                    continue

            # join whichever parts are present with single spaces, in the
            # order collective name, first name, last name
            full_name = ""
            for part in (collective, first, last):
                if part is None:
                    continue
                if full_name != "":
                    full_name += " "
                full_name += part
            article.add_author(name=full_name)

        return article

Coverage for portality/tasks/harvester_helpers/epmc/epmc_harvester.py: 87%

90 statements