Coverage for portality / models / data_dump.py: 85%

120 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-05 00:09 +0100

1from portality.lib.seamless import SeamlessMixin 

2from portality.dao import DomainObject 

3from portality.lib.coerce import COERCE_MAP 

4from datetime import datetime 

5from portality.lib import dates, es_data_mapping 

6from typing import Union, List 

7from portality.core import app 

8 

9 

# Seamless struct for a data dump record: scalar fields at the top level,
# plus one nested object per dump kind ("article" and "journal").
# Both nested objects declare the same four fields, so they are generated
# from a single template; every entry gets its own fresh dicts.
DATA_DUMP_STRUCT = {
    "fields": {
        "id": {"coerce": "unicode"},
        "created_date": {"coerce": "utcdatetime"},
        "last_updated": {"coerce": "utcdatetime"},
        "es_type": {"coerce": "unicode"},
        "dump_date": {"coerce": "utcdatetime"},
    },
    "objects": ["article", "journal"],
    "structs": {
        dump_kind: {
            "fields": {
                "container": {"coerce": "unicode"},
                "filename": {"coerce": "unicode"},
                "url": {"coerce": "unicode"},
                "size": {"coerce": "integer"},
            }
        }
        for dump_kind in ("article", "journal")
    },
}

41 

# Options passed to es_data_mapping.create_mapping() by DataDump.mappings().
# The coerce defaults are read from application config at import time.
MAPPING_OPTS = {
    "dynamic": None,
    "coerces": app.config["DATAOBJ_TO_MAPPING_DEFAULTS"],
}

46 

class DataDump(SeamlessMixin, DomainObject):
    """Model object of type ``data_dump``.

    The record is shaped by ``DATA_DUMP_STRUCT``: a ``dump_date`` plus two
    nested objects, ``article`` and ``journal``, each carrying ``container``,
    ``filename``, ``url`` and ``size``.
    """

    __type__ = "data_dump"

    __SEAMLESS_STRUCT__ = DATA_DUMP_STRUCT
    __SEAMLESS_COERCE__ = COERCE_MAP

    def __init__(self, **kwargs):
        """Build the object from keyword data, or from an ES hit's ``_source``."""
        # FIXME: hack, to deal with ES integration layer being improperly abstracted
        raw = kwargs["_source"] if "_source" in kwargs else kwargs
        super().__init__(raw=raw)

    def mappings(self):
        """Return the mapping created from this object's seamless struct and MAPPING_OPTS."""
        struct = self.__seamless_struct__.raw
        return es_data_mapping.create_mapping(struct, MAPPING_OPTS)

    @property
    def data(self):
        """The ``data`` attribute of the underlying seamless object."""
        return self.__seamless__.data

    # ------------------------------------------------------------------
    # Lookups
    # ------------------------------------------------------------------

    @classmethod
    def all_dumps_before(cls, cutoff: datetime) -> list:
        """Run a CutoffQuery for ``cutoff`` through ``object_query`` and return the result.

        The query matches ``dump_date`` strictly before ``cutoff``, oldest first.
        """
        # NOTE(review): the query carries no explicit "size"; confirm that
        # object_query returns every match rather than a default-sized page.
        return cls.object_query(CutoffQuery(cutoff).query())

    @classmethod
    def find_by_filename(cls, filename: str) -> List['DataDump']:
        """Run a FilenameQuery for ``filename`` through ``object_query`` and return the result."""
        return cls.object_query(FilenameQuery(filename).query())

    @classmethod
    def find_latest(cls):
        """Return the first result of a LatestQuery, or None when there is none."""
        found = cls.object_query(LatestQuery().query())
        if found is None or len(found) == 0:
            return None
        return found[0]

    @classmethod
    def first_dump_after(cls, cutoff: datetime) -> Union[None, 'DataDump']:
        """Return the first result of a FirstAfterQuery for ``cutoff``, or None when there is none."""
        found = cls.object_query(FirstAfterQuery(cutoff).query())
        if found is None or len(found) == 0:
            return None
        return found[0]

    # ------------------------------------------------------------------
    # Dump date
    # ------------------------------------------------------------------

    @property
    def dump_date(self):
        """The stored ``dump_date``, read through the ``datetime`` coerce function."""
        # NOTE(review): the struct declares this field as "utcdatetime" while the
        # getter coerces with "datetime" - confirm the difference is intended.
        return self.__seamless__.get_single(
            "dump_date",
            coerce=COERCE_MAP["datetime"],
        )

    @dump_date.setter
    def dump_date(self, dump_date: Union[str, datetime]):
        """Store ``dump_date`` (annotated as a string or a datetime) via ``set_with_struct``."""
        self.__seamless__.set_with_struct("dump_date", dump_date)

    # ------------------------------------------------------------------
    # Article dump
    # ------------------------------------------------------------------

    def set_article_dump(self, container, filename, size, url):
        """Set the whole ``article`` object from the given container, filename, size and url."""
        article_record = {
            "container": container,
            "filename": filename,
            "url": url,
            "size": size,
        }
        self.__seamless__.set_with_struct("article", article_record)

    def remove_article_dump(self):
        """Delete the ``article`` object from the record."""
        self.__seamless__.delete("article")

    @property
    def article_container(self):
        """Value stored at ``article.container``."""
        return self.__seamless__.get_single("article.container")

    @property
    def article_filename(self):
        """Value stored at ``article.filename``."""
        return self.__seamless__.get_single("article.filename")

    @property
    def article_url(self):
        """Value stored at ``article.url``."""
        return self.__seamless__.get_single("article.url")

    @property
    def article_size(self):
        """Value stored at ``article.size``."""
        return self.__seamless__.get_single("article.size")

    @property
    def article_size_human(self):
        """``article_size`` formatted by ``_int_to_filesize``; None when no size is stored."""
        size = self.article_size
        return None if size is None else self._int_to_filesize(size)

    # ------------------------------------------------------------------
    # Journal dump
    # ------------------------------------------------------------------

    def set_journal_dump(self, container, filename, size, url):
        """Set the whole ``journal`` object from the given container, filename, size and url."""
        journal_record = {
            "container": container,
            "filename": filename,
            "url": url,
            "size": size,
        }
        self.__seamless__.set_with_struct("journal", journal_record)

    def remove_journal_dump(self):
        """Delete the ``journal`` object from the record."""
        self.__seamless__.delete("journal")

    @property
    def journal_container(self):
        """Value stored at ``journal.container``."""
        return self.__seamless__.get_single("journal.container")

    @property
    def journal_filename(self):
        """Value stored at ``journal.filename``."""
        return self.__seamless__.get_single("journal.filename")

    @property
    def journal_size(self):
        """Value stored at ``journal.size``."""
        return self.__seamless__.get_single("journal.size")

    @property
    def journal_size_human(self):
        """``journal_size`` formatted by ``_int_to_filesize``; None when no size is stored."""
        size = self.journal_size
        return None if size is None else self._int_to_filesize(size)

    @property
    def journal_url(self):
        """Value stored at ``journal.url``."""
        return self.__seamless__.get_single("journal.url")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _int_to_filesize(self, value):
        """Format ``value`` as a two-decimal size string such as ``"1.50 KB"``.

        The value is divided by 1024 for each unit step from B up to TB;
        anything still at or above 1024 after TB is reported in PB.
        Returns None when ``value`` is None.
        """
        if value is None:
            return None
        for unit in ("B", "KB", "MB", "GB", "TB"):
            if value < 1024.0:
                return f"{value:.2f} {unit}"
            value /= 1024.0
        return f"{value:.2f} PB"

176 

class CutoffQuery(object):
    """Query for records whose ``dump_date`` is strictly before a cutoff, oldest first."""

    def __init__(self, cutoff: datetime):
        self.cutoff = cutoff

    def query(self):
        """Return the query body as a dict, with the cutoff rendered by ``dates.format``."""
        before_cutoff = {
            "range": {"dump_date": {"lt": dates.format(self.cutoff)}}
        }
        oldest_first = {"dump_date": {"order": "asc"}}
        return {"query": before_cutoff, "sort": oldest_first}

196 

class FirstAfterQuery(object):
    """Query for the single earliest record whose ``dump_date`` is at or after a cutoff."""

    def __init__(self, cutoff: datetime):
        self.cutoff = cutoff

    def query(self):
        """Return the query body as a dict, with the cutoff rendered by ``dates.format``."""
        from_cutoff = {
            "range": {"dump_date": {"gte": dates.format(self.cutoff)}}
        }
        # Ascending order with a size of 1 keeps only the earliest match.
        earliest_first = {"dump_date": {"order": "asc"}}
        return {"query": from_cutoff, "sort": earliest_first, "size": 1}

217 

class LatestQuery:
    """Query for the single record with the most recent ``dump_date``."""

    def query(self):
        """Return the query body as a dict: match everything, newest first, one result."""
        everything = {"match_all": {}}
        newest_first = {"dump_date": {"order": "desc"}}
        return {"query": everything, "sort": newest_first, "size": 1}

231 

class FilenameQuery(object):
    """Query for records whose article or journal dump has the given filename."""

    def __init__(self, filename: str):
        self.filename = filename

    def query(self):
        """Return the query body as a dict: a bool/should over exact term matches.

        One term clause is produced for ``article.filename.exact`` and one
        for ``journal.filename.exact``, in that order.
        """
        either_filename = [
            {"term": {field: self.filename}}
            for field in ("article.filename.exact", "journal.filename.exact")
        ]
        return {"query": {"bool": {"should": either_filename}}}