Coverage for portality / bll / services / site.py: 91%

178 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-04 09:41 +0100

1import re 

2import os 

3from datetime import datetime 

4 

5from portality import models 

6from portality.bll import exceptions 

7from portality.core import app 

8from portality.lib import nav, dates 

9from portality.lib.argvalidate import argvalidate 

10from portality.lib.dates import FMT_DATETIME_SHORT, FMT_DATETIME_STD 

11from portality.models import Cache, JournalArticle 

12from portality.store import StoreFactory, prune_container 

13from portality.util import get_full_url_safe 

14from collections.abc import Iterable 

15 

# ElementTree-style namespace prefix for sitemap elements
# (not referenced in this chunk — presumably consumed by sitemap-parsing code elsewhere)
NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

# Elasticsearch query selecting only records currently flagged as in DOAJ
IN_DOAJ = {
    "query": {
        "bool": {
            "must": [
                {"term": {"admin.in_doaj": True}}
            ]
        }
    }
}

# Namespace URI written into generated <urlset> / <sitemapindex> files
NMSP = "http://www.sitemaps.org/schemas/sitemap/0.9"

# Per-file rollover thresholds, kept just under the sitemaps.org protocol
# limits (50MB / 50,000 URLs) to leave headroom for closing tags
MAX_FILE_SIZE = (49 * 1024 * 1024)
MAX_URL_COUNT = 49000

29 

class ChunkedURLListFileGenerator(Iterable):
    """Write an unbounded stream of URLs into a rolling series of XML files.

    A new file is started whenever the current one reaches ``max_file_size``
    bytes or ``max_url_count`` URLs.  Finalised files are uploaded to
    ``main_store`` and their public URLs collected in ``self.files`` (the
    instance iterates over those URLs).

    Subclasses supply the concrete XML dialect by implementing
    ``write_url_element``, ``write_list_start_tag`` and ``write_list_end_tag``.
    """

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id, max_file_size=None, max_url_count=None):
        """
        :param directory: sub-directory (relative, joined under temp_store) files are written to
        :param filename_prefix: stem for generated filenames: <prefix>_<n>_utf8.xml
        :param temp_store: local filesystem path the files are assembled under
        :param main_store: store object finished files are uploaded to
        :param container_id: container in main_store to upload into
        :param max_file_size: rollover threshold in bytes (defaults to MAX_FILE_SIZE)
        :param max_url_count: rollover threshold in URLs (defaults to MAX_URL_COUNT)
        """
        self.file_idx = 0           # numeric suffix of the next file to create
        self.url_count = 0          # URLs written to the *current* file
        self.current_file_path = None
        self.current_filename = None
        self.file = None
        # Defaults resolved lazily so explicit arguments never touch the
        # module-level constants; passing None keeps the historical defaults.
        self.max_file_size = MAX_FILE_SIZE if max_file_size is None else max_file_size
        self.max_url_count = MAX_URL_COUNT if max_url_count is None else max_url_count
        self.directory = directory
        self.filename_prefix = filename_prefix
        self.temp_store = temp_store
        self.main_store = main_store
        self.container_id = container_id
        self.files = []             # public URLs of finalised files, in order

        self.create_file()

    def add_url(self, url, lastmod=None):
        """Append one URL element, rolling over to a new file if a limit is hit.

        :param url: the location to record
        :param lastmod: optional last-modified value, passed through to the element writer
        """
        self.write_url_element(url, lastmod=lastmod)
        # Count the URL *before* the limit check so max_url_count is an exact
        # per-file cap.  Previously the check ran first, which let one extra
        # URL into each file and then mis-attributed that URL to the freshly
        # created file's count.
        self.url_count += 1
        self.check_and_finalize_file()

    def create_file(self):
        """Open the next chunk file and write the XML prologue and list start tag."""
        self.current_filename = os.path.join(self.directory, f'{self.filename_prefix}_{self.file_idx}_utf8.xml')
        self.current_file_path = os.path.join(self.temp_store, self.current_filename)
        # The XML declaration promises UTF-8, so open with that encoding
        # explicitly rather than relying on the platform default.
        self.file = open(self.current_file_path, "w", encoding="utf-8")
        self.file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        self.write_list_start_tag()
        self.file_idx += 1

    def check_and_finalize_file(self):
        """Roll over to a fresh file when the size or URL-count limit is reached."""
        # Flush first: getsize reports the on-disk size, which lags behind the
        # stream's internal buffer and would otherwise under-count.
        self.file.flush()
        file_size = os.path.getsize(self.current_file_path)
        if file_size >= self.max_file_size or self.url_count >= self.max_url_count:
            self.finalize_file()
            self.create_file()
            self.url_count = 0

    def finalize_file(self):
        """Close the current file, upload it to main_store and record its public URL."""
        self.write_list_end_tag()
        self.file.close()
        self.main_store.store(self.container_id, self.current_filename, source_path=self.current_file_path)
        self.files.append(self.main_store.url(self.container_id, self.current_filename))

    def get_url_count(self):
        """Number of URLs written to the current (not yet finalised) file."""
        return self.url_count

    def get_files(self):
        """Public URLs of all finalised files, in creation order."""
        return self.files

    def __iter__(self):
        return iter(self.files)

    ###########################################
    ## functions to be implemented by subclasses

    def write_url_element(self, loc, lastmod=None):
        raise NotImplementedError("Subclasses must implement write_url_element")

    def write_list_start_tag(self):
        raise NotImplementedError("Subclasses must implement write_list_start_tag")

    def write_list_end_tag(self):
        raise NotImplementedError("Subclasses must implement write_list_end_tag")

94 

95 

class SitemapGenerator(ChunkedURLListFileGenerator):
    """Chunked URL-list writer producing sitemaps.org ``<urlset>`` files."""

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id):
        super(SitemapGenerator, self).__init__(directory, filename_prefix, temp_store, main_store, container_id)
        # Every URL entry advertises the same change frequency, taken from app config
        self.change_freq = app.config.get("TOC_CHANGEFREQ", "monthly")

    def write_url_element(self, loc, lastmod=None):
        # Build one <url> entry; <lastmod> is only emitted when a
        # modification date is actually known.
        url_ele = f"""
    <url>
        <loc>{loc}</loc>
        <changefreq>{self.change_freq}</changefreq>"""
        if lastmod is not None:
            url_ele += f"\n        <lastmod>{lastmod}</lastmod>"
        url_ele += "\n    </url>"
        self.file.write(url_ele)

    def write_list_start_tag(self):
        self.file.write('<urlset xmlns="'+NMSP+'">')

    def write_list_end_tag(self):
        self.file.write('\n</urlset>\n')

117 

118 

class SitemapIndexGenerator(ChunkedURLListFileGenerator):
    """Chunked URL-list writer producing sitemaps.org ``<sitemapindex>`` files."""

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id):
        # Index files roll over on entry count only (configurable); the byte-size
        # limit is left at the base-class default.
        max_entries = app.config.get("SITEMAP_INDEX_MAX_ENTRIES", 50000)
        super(SitemapIndexGenerator, self).__init__(directory, filename_prefix, temp_store, main_store, container_id, max_url_count=max_entries)

    def write_url_element(self, loc, lastmod=None):
        # One <sitemap> entry per child sitemap file; <lastmod> only when supplied
        self.file.write(f"  <sitemap>\n")
        self.file.write(f"    <loc>{loc}</loc>\n")
        if lastmod is not None:
            self.file.write(f"    <lastmod>{lastmod}</lastmod>\n")
        self.file.write(f"  </sitemap>\n")

    def write_list_start_tag(self):
        self.file.write('<sitemapindex xmlns="' + NMSP + '">\n')

    def write_list_end_tag(self):
        self.file.write('\n</sitemapindex>\n')

137 

138 

class SiteService(object):
    """Business-logic service for site-wide concerns: sitemap generation
    and site statistics."""

    @staticmethod
    def sitemap(prune: bool = True):
        """
        Generate the sitemap files and sitemap index, upload them to the
        cache store, register them in the Cache model, and optionally prune
        the output of previous runs.
        ~~Sitemap:Feature~~
        :param prune: whether to remove sitemap directories left by previous runs
        :return: tuple of (list of sitemap index file URLs, action register messages)
        """
        # Validate under this function's own name.  This previously said
        # "csv" — a copy-paste from the CSV service — which mislabelled any
        # argument-validation error messages.
        argvalidate("sitemap", [
            {"arg": prune, "allow_none": False, "arg_name": "prune"}
        ], exceptions.ArgumentException)

        action_register = []
        base_url = app.config.get("BASE_URL")
        if not base_url.endswith("/"):
            base_url += "/"

        # Each run writes into a timestamped directory so old runs can be pruned
        run_start_time = dates.now_str(FMT_DATETIME_SHORT)
        directory = 'sitemap_doaj_' + run_start_time
        filename_prefix = "sitemap"
        container_id = app.config.get("STORE_CACHE_CONTAINER")

        total_static_pages = 0
        total_journals_count = 0
        total_articles_count = 0

        # ~~->FileStore:Feature~~
        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("cache")

        # temporary directory
        tmp_store_dir = tmpStore.path(container_id, '', create_container=True)
        # Create the directories if they don't exist
        os.makedirs(os.path.join(tmp_store_dir, directory), exist_ok=True)

        sitemap_generator = SitemapGenerator(directory, filename_prefix, tmp_store_dir, mainStore, container_id)

        # Generating URLs for static pages, de-duplicated and sorted for stable output
        _entries = nav.get_nav_entries()
        _routes = nav.yield_all_route(_entries)
        _urls = (get_full_url_safe(r) for r in _routes)
        _urls = filter(None, _urls)   # drop routes that could not be resolved
        _urls = set(_urls)
        _urls = sorted(_urls)

        # static pages
        for u in _urls:
            sitemap_generator.add_url(u)
            total_static_pages += 1

        # Generating URLs for journals: one ToC page and one ToC articles page each
        for j in models.Journal.all_in_doaj():
            toc_loc = base_url + "toc/" + j.toc_id
            sitemap_generator.add_url(toc_loc, lastmod=j.last_updated)
            toc_art_loc = base_url + "toc/" + j.toc_id + "/articles"
            sitemap_generator.add_url(toc_art_loc)
            total_journals_count += 1

        # Generating URLs for articles currently in DOAJ
        for a in models.Article.iterate(q=IN_DOAJ, keepalive='5m'):
            article_loc = base_url + "article/" + a.id
            sitemap_generator.add_url(article_loc, lastmod=a.last_updated)
            total_articles_count += 1

        # Finalise the last sitemap file if it holds any URLs
        if sitemap_generator.get_url_count() > 0:
            sitemap_generator.finalize_file()

        # Create sitemap index file(s) pointing at the public sitemapN.xml URLs
        sitemap_files = sitemap_generator.get_files()
        lastmod_date = dates.now_str(FMT_DATETIME_STD)

        sitemap_prefix = "sitemap_index"
        index_generator = SitemapIndexGenerator(directory, sitemap_prefix, tmp_store_dir, mainStore, container_id)

        for i, sitemap_file in enumerate(sitemap_files):
            public_url = f"{base_url}sitemap{i}.xml"
            index_generator.add_url(public_url, lastmod=lastmod_date)
            # Cache maps the public sitemapN.xml name to the stored file URL
            models.Cache.cache_nth_sitemap(i, sitemap_file)

        # Finalise the last index file if it holds any entries
        if index_generator.get_url_count() > 0:
            index_generator.finalize_file()

        index_files = index_generator.get_files()
        models.Cache.cache_sitemap_indexes(index_files)

        # Delete any additional maps from previous cache. Usually this may not
        # be the situation but check: count up any extra cached sitemaps beyond
        # this run's file count and delete them.
        next_sitemap_ix = len(sitemap_files)
        while True:
            cache = models.Cache.pull("sitemap" + str(next_sitemap_ix))
            if cache:
                cache.delete()
            else:
                break
            next_sitemap_ix += 1

        # Prune old sitemap files if required
        if prune:
            def sort(filelist):
                # Keep newest-first by the timestamp embedded in the directory name
                rx = r"^sitemap_doaj_(\d{8})_(\d{4})"

                matched_dates = [
                    (filename, datetime.strptime(match.groups()[0] + "_" + match.groups()[1], FMT_DATETIME_SHORT))
                    for filename in filelist
                    if (match := re.match(rx, filename))
                ]
                return [x for x, _ in sorted(matched_dates, key=lambda x: x[1], reverse=True)]

            def _filter(filename):
                # Only touch files/directories this job produced
                return filename.startswith("sitemap_")

            action_register += prune_container(mainStore, container_id, sort, filter=_filter, keep=2, is_directory=True)
            action_register += prune_container(tmpStore, container_id, sort, filter=_filter, keep=2)

        action_register.append(f"Static pages count : {total_static_pages}")
        action_register.append(f"Journal URLs count : {total_journals_count}")
        action_register.append(f"Article URLs count : {total_articles_count}")

        return index_files, action_register

    def site_statistics(self):
        """
        Get the site statistics
        ~~SiteStatistics:Feature~~
        :return: the statistics object, from cache when available
        """
        # First check the cache, if it's there (stale or not), return it. Otherwise, calculate and return.
        stats = Cache.get_site_statistics()
        if stats is not None:
            return stats

        return JournalArticle.site_statistics()

    def cache_site_statistics(self):
        """
        Calculate the site statistics and store them in the cache
        ~~SiteStatisticsCache:Feature~~
        :return: the freshly calculated statistics
        """
        stats = JournalArticle.site_statistics()
        Cache.cache_site_statistics(stats)
        return stats