Coverage for portality / models / data_dump.py: 85%
120 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
1from portality.lib.seamless import SeamlessMixin
2from portality.dao import DomainObject
3from portality.lib.coerce import COERCE_MAP
4from datetime import datetime
5from portality.lib import dates, es_data_mapping
6from typing import Union, List
7from portality.core import app
# Seamless struct for a data dump record: top-level metadata fields plus one
# nested object per dump kind ("article" and "journal").
DATA_DUMP_STRUCT = {
    "fields": {
        "id": {"coerce": "unicode"},
        "created_date": {"coerce": "utcdatetime"},
        "last_updated": {"coerce": "utcdatetime"},
        "es_type": {"coerce": "unicode"},
        "dump_date": {"coerce": "utcdatetime"},
    },
    "objects": ["article", "journal"],
    # Both dump kinds have the same shape; the comprehension evaluates the
    # literal once per kind, so each kind gets its own independent dict.
    "structs": {
        dump_kind: {
            "fields": {
                "container": {"coerce": "unicode"},
                "filename": {"coerce": "unicode"},
                "url": {"coerce": "unicode"},
                "size": {"coerce": "integer"},
            }
        }
        for dump_kind in ("article", "journal")
    },
}
# Options passed to es_data_mapping.create_mapping() alongside the struct
# when DataDump.mappings() is called.
MAPPING_OPTS = dict(
    dynamic=None,
    coerces=app.config["DATAOBJ_TO_MAPPING_DEFAULTS"],
)
class DataDump(SeamlessMixin, DomainObject):
    """Domain object for a data dump record.

    A record carries a ``dump_date`` and up to two nested file descriptions,
    ``article`` and ``journal``, each with ``container``, ``filename``,
    ``url`` and ``size``.
    """

    __type__ = "data_dump"

    __SEAMLESS_STRUCT__ = DATA_DUMP_STRUCT
    __SEAMLESS_COERCE__ = COERCE_MAP

    def __init__(self, **kwargs):
        """Build the object from keyword data, unwrapping ``_source`` if present."""
        # FIXME: hack - the ES integration layer is improperly abstracted, so
        # the keyword data may arrive wrapped under a "_source" key
        raw = kwargs["_source"] if "_source" in kwargs else kwargs
        super().__init__(raw=raw)

    def mappings(self):
        """Return the mapping created from this object's struct and MAPPING_OPTS."""
        struct = self.__seamless_struct__.raw
        return es_data_mapping.create_mapping(struct, MAPPING_OPTS)

    @property
    def data(self):
        """The data held by the underlying seamless object."""
        return self.__seamless__.data

    # -- lookups ------------------------------------------------------------

    @classmethod
    def all_dumps_before(cls, cutoff: datetime) -> list:
        """Return the ``object_query`` result for dumps dated before ``cutoff``."""
        body = CutoffQuery(cutoff).query()
        return cls.object_query(body)

    @classmethod
    def find_by_filename(cls, filename: str) -> List['DataDump']:
        """Return the ``object_query`` result for dumps whose article or
        journal filename matches ``filename``."""
        body = FilenameQuery(filename).query()
        return cls.object_query(body)

    @classmethod
    def find_latest(cls):
        """Return the first hit of the latest-dump query, or None if there is none."""
        hits = cls.object_query(LatestQuery().query())
        if hits is None or len(hits) == 0:
            return None
        return hits[0]

    @classmethod
    def first_dump_after(cls, cutoff: datetime) -> Union[None, 'DataDump']:
        """Return the first hit dated at or after ``cutoff``, or None if there is none."""
        hits = cls.object_query(FirstAfterQuery(cutoff).query())
        if hits is None or len(hits) == 0:
            return None
        return hits[0]

    # -- dump date ----------------------------------------------------------

    @property
    def dump_date(self):
        """The ``dump_date`` field, read with the "datetime" coerce function."""
        as_datetime = COERCE_MAP["datetime"]
        return self.__seamless__.get_single("dump_date", coerce=as_datetime)

    @dump_date.setter
    def dump_date(self, dump_date: Union[str, datetime]):
        self.__seamless__.set_with_struct("dump_date", dump_date)

    # -- article dump -------------------------------------------------------

    def set_article_dump(self, container, filename, size, url):
        """Store the article dump file description on this record."""
        record = {
            "container": container,
            "filename": filename,
            "url": url,
            "size": size,
        }
        self.__seamless__.set_with_struct("article", record)

    def remove_article_dump(self):
        """Delete the article dump file description from this record."""
        self.__seamless__.delete("article")

    @property
    def article_container(self):
        return self.__seamless__.get_single("article.container")

    @property
    def article_filename(self):
        return self.__seamless__.get_single("article.filename")

    @property
    def article_url(self):
        return self.__seamless__.get_single("article.url")

    @property
    def article_size(self):
        return self.__seamless__.get_single("article.size")

    @property
    def article_size_human(self):
        """The article size as a human-readable string, or None if no size is set."""
        size = self.article_size
        return None if size is None else self._int_to_filesize(size)

    # -- journal dump -------------------------------------------------------

    def set_journal_dump(self, container, filename, size, url):
        """Store the journal dump file description on this record."""
        record = {
            "container": container,
            "filename": filename,
            "url": url,
            "size": size,
        }
        self.__seamless__.set_with_struct("journal", record)

    def remove_journal_dump(self):
        """Delete the journal dump file description from this record."""
        self.__seamless__.delete("journal")

    @property
    def journal_container(self):
        return self.__seamless__.get_single("journal.container")

    @property
    def journal_filename(self):
        return self.__seamless__.get_single("journal.filename")

    @property
    def journal_size(self):
        return self.__seamless__.get_single("journal.size")

    @property
    def journal_size_human(self):
        """The journal size as a human-readable string, or None if no size is set."""
        size = self.journal_size
        return None if size is None else self._int_to_filesize(size)

    @property
    def journal_url(self):
        return self.__seamless__.get_single("journal.url")

    # -- helpers ------------------------------------------------------------

    def _int_to_filesize(self, value):
        """Format ``value`` as a size string with two decimals and a unit.

        The value is divided by 1024 until it drops below 1024, stepping
        through B, KB, MB, GB and TB; anything still larger is reported in
        PB. Returns None when ``value`` is None.
        """
        if value is None:
            return None

        remaining = value
        for suffix in ('B', 'KB', 'MB', 'GB', 'TB'):
            if remaining < 1024.0:
                return f"{remaining:.2f} {suffix}"
            remaining /= 1024.0
        # ran out of smaller units
        return f"{remaining:.2f} PB"
class CutoffQuery(object):
    """Query for records whose ``dump_date`` is strictly before a cutoff."""

    def __init__(self, cutoff: datetime):
        self.cutoff = cutoff

    def query(self):
        """Return the query body: a ``lt`` range on ``dump_date``, oldest first."""
        upper_bound = dates.format(self.cutoff)
        date_range = {"dump_date": {"lt": upper_bound}}
        oldest_first = {"dump_date": {"order": "asc"}}
        return {
            "query": {"range": date_range},
            "sort": oldest_first,
        }
class FirstAfterQuery(object):
    """Query for the single earliest record dated at or after a cutoff."""

    def __init__(self, cutoff: datetime):
        self.cutoff = cutoff

    def query(self):
        """Return the query body: a ``gte`` range on ``dump_date``, sorted
        ascending and limited to one result."""
        lower_bound = dates.format(self.cutoff)
        return {
            "query": {"range": {"dump_date": {"gte": lower_bound}}},
            "sort": {"dump_date": {"order": "asc"}},
            "size": 1,
        }
class LatestQuery:
    """Query for the single record with the most recent ``dump_date``."""

    def query(self):
        """Return the query body: match everything, sort by ``dump_date``
        descending and keep one result."""
        newest_first = {"dump_date": {"order": "desc"}}
        return {
            "query": {"match_all": {}},
            "sort": newest_first,
            "size": 1,
        }
class FilenameQuery(object):
    """Query for records whose article or journal filename equals a given name."""

    def __init__(self, filename: str):
        self.filename = filename

    def query(self):
        """Return the query body: a ``bool``/``should`` of two ``term``
        clauses, one per exact filename field."""
        filename_fields = ("article.filename.exact", "journal.filename.exact")
        alternatives = [
            {"term": {field: self.filename}}
            for field in filename_fields
        ]
        return {"query": {"bool": {"should": alternatives}}}