Coverage for portality/bll/services/site.py: 91%
178 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
1import re
2import os
3from datetime import datetime
5from portality import models
6from portality.bll import exceptions
7from portality.core import app
8from portality.lib import nav, dates
9from portality.lib.argvalidate import argvalidate
10from portality.lib.dates import FMT_DATETIME_SHORT, FMT_DATETIME_STD
11from portality.models import Cache, JournalArticle
12from portality.store import StoreFactory, prune_container
13from portality.util import get_full_url_safe
14from collections.abc import Iterable
# ElementTree-style namespace prefix for sitemap elements (used when parsing)
NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
# query restricting records to those currently flagged as in DOAJ
IN_DOAJ = {
    "query": {
        "bool": {
            "must": [
                {"term": {"admin.in_doaj": True}}
            ]
        }
    }
}
# sitemap schema namespace written into generated files
NMSP = "http://www.sitemaps.org/schemas/sitemap/0.9"
# rollover thresholds for chunked sitemap files; presumably kept just under
# the sitemap protocol limits (50MB / 50,000 URLs) for safety — TODO confirm
MAX_FILE_SIZE = (49 * 1024 * 1024)
MAX_URL_COUNT = 49000
class ChunkedURLListFileGenerator(Iterable):
    """Write URL entries into a sequence of XML files, rolling over to a new
    file whenever the current one reaches a size or entry-count limit.

    Subclasses provide the concrete markup by implementing
    ``write_url_element``, ``write_list_start_tag`` and ``write_list_end_tag``.
    Finalised files are uploaded to ``main_store`` and their store URLs
    collected in ``self.files`` (also available by iterating the instance).
    """

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id, max_file_size=MAX_FILE_SIZE, max_url_count=MAX_URL_COUNT):
        """
        :param directory: sub-directory (relative to temp_store) to write into
        :param filename_prefix: prefix for each generated file's name
        :param temp_store: local path where files are written before upload
        :param main_store: store object exposing ``store`` and ``url``
        :param container_id: container in main_store to upload files into
        :param max_file_size: roll over once the current file reaches this many bytes
        :param max_url_count: roll over once this many entries are in the current file
        """
        self.file_idx = 0
        self.url_count = 0
        self.current_file_path = None
        self.current_filename = None
        self.file = None
        self.max_file_size = max_file_size
        self.max_url_count = max_url_count
        self.directory = directory
        self.filename_prefix = filename_prefix
        self.temp_store = temp_store
        self.main_store = main_store
        self.container_id = container_id
        self.files = []

        self.create_file()

    def add_url(self, url, lastmod=None):
        """Write one URL entry, rolling to a new file if a limit is reached."""
        self.write_url_element(url, lastmod=lastmod)
        # count before checking, so a file never exceeds max_url_count entries
        # (previously the check ran first, allowing max_url_count + 1 entries)
        self.url_count += 1
        self.check_and_finalize_file()

    def create_file(self):
        """Open the next output file and write the XML declaration and start tag."""
        self.current_filename = os.path.join(self.directory, f'{self.filename_prefix}_{self.file_idx}_utf8.xml')
        self.current_file_path = os.path.join(self.temp_store, self.current_filename)
        # the declaration promises UTF-8, so open the file explicitly as UTF-8
        # rather than relying on the platform default encoding
        self.file = open(self.current_file_path, "w", encoding="utf-8")
        self.file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        self.write_list_start_tag()
        self.file_idx += 1

    def check_and_finalize_file(self):
        """Roll over to a new file if the current one has hit a limit."""
        # flush buffered writes so getsize reflects what was actually written
        self.file.flush()
        file_size = os.path.getsize(self.current_file_path)
        if file_size >= self.max_file_size or self.url_count >= self.max_url_count:
            self.finalize_file()
            self.create_file()
            self.url_count = 0

    def finalize_file(self):
        """Close the current file and upload it to the main store."""
        self.write_list_end_tag()
        self.file.close()
        self.main_store.store(self.container_id, self.current_filename, source_path=self.current_file_path)
        self.files.append(self.main_store.url(self.container_id, self.current_filename))

    def get_url_count(self):
        # number of entries written to the current (not yet finalised) file
        return self.url_count

    def get_files(self):
        # store URLs of all finalised files, in creation order
        return self.files

    def __iter__(self):
        return iter(self.files)

    ###########################################
    ## functions to be implemented by subclasses

    def write_url_element(self, loc, lastmod=None):
        raise NotImplementedError("Subclasses must implement write_url_element")

    def write_list_start_tag(self):
        raise NotImplementedError("Subclasses must implement write_list_start_tag")

    def write_list_end_tag(self):
        raise NotImplementedError("Subclasses must implement write_list_end_tag")
class SitemapGenerator(ChunkedURLListFileGenerator):
    """Chunked file generator producing sitemap ``<urlset>`` documents."""

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id):
        super(SitemapGenerator, self).__init__(directory, filename_prefix, temp_store, main_store, container_id)
        self.change_freq = app.config.get("TOC_CHANGEFREQ", "monthly")

    def write_url_element(self, loc, lastmod=None):
        """Append one ``<url>`` entry for *loc*, optionally with a lastmod date.

        The location is entity-escaped as the sitemap protocol requires —
        URLs containing e.g. '&' would otherwise produce invalid XML.
        """
        from xml.sax.saxutils import escape
        url_ele = f"""
    <url>
        <loc>{escape(loc)}</loc>
        <changefreq>{self.change_freq}</changefreq>"""
        if lastmod is not None:
            url_ele += f"\n        <lastmod>{lastmod}</lastmod>"
        url_ele += "\n    </url>"
        self.file.write(url_ele)

    def write_list_start_tag(self):
        self.file.write('<urlset xmlns="' + NMSP + '">')

    def write_list_end_tag(self):
        self.file.write('\n</urlset>\n')
class SitemapIndexGenerator(ChunkedURLListFileGenerator):
    """Chunked file generator producing sitemap ``<sitemapindex>`` documents."""

    def __init__(self, directory, filename_prefix, temp_store, main_store, container_id):
        # index files have their own entry limit, independent of MAX_URL_COUNT
        max_entries = app.config.get("SITEMAP_INDEX_MAX_ENTRIES", 50000)
        super(SitemapIndexGenerator, self).__init__(directory, filename_prefix, temp_store, main_store, container_id, max_url_count=max_entries)

    def write_url_element(self, loc, lastmod=None):
        """Append one ``<sitemap>`` entry. The location is entity-escaped as
        the sitemap protocol requires (e.g. '&' in URLs)."""
        from xml.sax.saxutils import escape
        self.file.write(f"    <sitemap>\n")
        self.file.write(f"        <loc>{escape(loc)}</loc>\n")
        if lastmod is not None:
            self.file.write(f"        <lastmod>{lastmod}</lastmod>\n")
        self.file.write(f"    </sitemap>\n")

    def write_list_start_tag(self):
        self.file.write('<sitemapindex xmlns="' + NMSP + '">\n')

    def write_list_end_tag(self):
        self.file.write('\n</sitemapindex>\n')
class SiteService(object):
    """Business-logic service for site-wide features: sitemap generation and
    site statistics (cached and freshly-calculated)."""

    @staticmethod
    def sitemap(prune: bool = True):
        """
        Generate the sitemap
        ~~Sitemap:Feature~~
        :param prune: whether to remove old sitemap runs from the stores (keeps the latest 2)
        :return: tuple of (list of sitemap index file URLs, list of action messages)
        """
        # validation label fixed: it previously said "csv" (copy/paste error)
        argvalidate("sitemap", [
            {"arg": prune, "allow_none": False, "arg_name": "prune"}
        ], exceptions.ArgumentException)

        action_register = []
        base_url = app.config.get("BASE_URL")
        if not base_url.endswith("/"):
            base_url += "/"

        # each run writes into its own timestamped directory so old runs can be pruned
        run_start_time = dates.now_str(FMT_DATETIME_SHORT)
        directory = 'sitemap_doaj_' + run_start_time
        filename_prefix = "sitemap"
        container_id = app.config.get("STORE_CACHE_CONTAINER")

        total_static_pages = 0
        total_journals_count = 0
        total_articles_count = 0

        # ~~->FileStore:Feature~~
        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("cache")

        # temporary directory
        tmp_store_dir = tmpStore.path(container_id, '', create_container=True)
        # Create the directories if they don't exist
        os.makedirs(os.path.join(tmp_store_dir, directory), exist_ok=True)

        sitemap_generator = SitemapGenerator(directory, filename_prefix, tmp_store_dir, mainStore, container_id)

        # Generating URLs for static pages
        _entries = nav.get_nav_entries()
        _routes = nav.yield_all_route(_entries)
        _urls = (get_full_url_safe(r) for r in _routes)
        _urls = filter(None, _urls)  # drop routes that did not resolve to a URL
        _urls = set(_urls)           # de-duplicate
        _urls = sorted(_urls)        # stable ordering across runs

        # static pages
        for u in _urls:
            sitemap_generator.add_url(u)
            total_static_pages += 1

        # Generating URLs for journals and articles
        for j in models.Journal.all_in_doaj():
            toc_loc = base_url + "toc/" + j.toc_id
            sitemap_generator.add_url(toc_loc, lastmod=j.last_updated)
            toc_art_loc = base_url + "toc/" + j.toc_id + "/articles"
            sitemap_generator.add_url(toc_art_loc)
            total_journals_count += 1

        # Generating URLs for articles
        for a in models.Article.iterate(q=IN_DOAJ, keepalive='5m'):
            article_loc = base_url + "article/" + a.id
            sitemap_generator.add_url(article_loc, lastmod=a.last_updated)
            total_articles_count += 1

        # finalise the last, partially-filled sitemap file (if any entries were written)
        if sitemap_generator.get_url_count() > 0:
            sitemap_generator.finalize_file()

        # Create sitemap index file(s)
        sitemap_files = sitemap_generator.get_files()
        lastmod_date = dates.now_str(FMT_DATETIME_STD)

        sitemap_prefix = "sitemap_index"
        index_generator = SitemapIndexGenerator(directory, sitemap_prefix, tmp_store_dir, mainStore, container_id)

        for i, sitemap_file in enumerate(sitemap_files):
            # the index references the stable public URL; the store URL is cached separately
            public_url = f"{base_url}sitemap{i}.xml"
            index_generator.add_url(public_url, lastmod=lastmod_date)
            models.Cache.cache_nth_sitemap(i, sitemap_file)

        # finalise the last, partially-filled index file
        if index_generator.get_url_count() > 0:
            index_generator.finalize_file()

        index_files = index_generator.get_files()
        models.Cache.cache_sitemap_indexes(index_files)

        # Delete any additional maps from previous cache. Usually this may not be the situation but check
        # Count up any additional cached sitemaps we find and delete them.
        next_sitemap_ix = len(sitemap_files)
        while True:
            cache = models.Cache.pull("sitemap" + str(next_sitemap_ix))
            if cache:
                cache.delete()
            else:
                break
            next_sitemap_ix += 1

        # Prune old sitemap files if required
        if prune:
            def sort(filelist):
                # run directories are named sitemap_doaj_<YYYYMMDD>_<HHMM>; sort newest first
                rx = r"^sitemap_doaj_(\d{8})_(\d{4})"
                matched_dates = [
                    (filename, datetime.strptime(match.groups()[0] + "_" + match.groups()[1], FMT_DATETIME_SHORT))
                    for filename in filelist
                    if (match := re.match(rx, filename))
                ]
                return [x for x, _ in sorted(matched_dates, key=lambda x: x[1], reverse=True)]

            def _filter(filename):
                return filename.startswith("sitemap_")

            action_register += prune_container(mainStore, container_id, sort, filter=_filter, keep=2, is_directory=True)
            action_register += prune_container(tmpStore, container_id, sort, filter=_filter, keep=2)

        action_register.append(f"Static pages count : {total_static_pages}")
        action_register.append(f"Journal URLs count : {total_journals_count}")
        action_register.append(f"Article URLs count : {total_articles_count}")

        return index_files, action_register

    def site_statistics(self):
        """
        Get the site statistics
        ~~SiteStatistics:Feature~~
        :return: the site statistics, from the cache if present (stale or not)
        """
        # First check the cache, if it's there (stale or not), return it. Otherwise, calculate and return.
        stats = Cache.get_site_statistics()
        if stats is not None:
            return stats

        return JournalArticle.site_statistics()

    def cache_site_statistics(self):
        """
        Cache the site statistics
        ~~SiteStatisticsCache:Feature~~
        :return: the freshly-calculated site statistics that were cached
        """
        stats = JournalArticle.site_statistics()
        Cache.cache_site_statistics(stats)
        return stats