Coverage for portality/view/status.py: 9%
156 statements

from flask import Blueprint, make_response, url_for
from portality import util
from portality.core import app
from portality import models
from portality.lib import dates
import json, requests, math, os, time
from datetime import datetime

blueprint = Blueprint('status', __name__)

@blueprint.route('/stats')
@util.jsonp
def stats():
    res = {}

    # Get inode use
    try:
        st = os.statvfs('/')
        res['inode_used_pc'] = int((float(st.f_files - st.f_ffree) / st.f_files) * 100)
        # could complete this by installing psutil, but as disk and memory are currently
        # monitored directly by DO there is no need - revisit if we move away from DO
        #res['disk_used_pc'] = int((float(st.f_blocks-st.f_bavail)/st.f_blocks)*100)
        #res['memory_used_pc'] = 0
    except:
        pass

    # Test writing to the filesystem
    ts = int(time.time())
    fn = '/tmp/status_test_write_' + str(ts) + '.txt'
    try:
        with open(fn, "w") as f:
            f.write("I am a test at " + str(ts))
        res['writable'] = True
    except:
        res['writable'] = False
    try:
        os.remove(fn)
    except:
        pass

    # Retrieve the hostname
    try:
        res['host'] = os.uname()[1]
    except:
        pass

    # Return a JSON response
    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp
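
# Illustrative only: on a healthy machine the /stats endpoint above returns JSON along
# the lines of {"inode_used_pc": 12, "writable": true, "host": "app-1"} (values made up).
# 'writable' is always present; 'inode_used_pc' and 'host' are omitted if their checks fail.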

@blueprint.route('/')
@util.jsonp
def status():
    res = {'stable': True, 'ping': {'apps': {}, 'indices': {}}, 'background': {'status': 'Background jobs are stable', 'info': []}, 'notes': []}

    # to get monitoring on this, use Uptime Robot or similar to check that the status page
    # contains the '"stable": true' string (as serialised by json.dumps) and the note strings set below
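    # A minimal sketch of such an external keyword check (assuming the blueprint is mounted
    # under /status; the real prefix is set wherever the blueprint is registered):
    #   curl -s https://example.org/status/ | grep -c '"stable": true'
    # Uptime Robot's keyword monitor type can watch for the same string.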

    app_note = 'apps reachable'
    app_unreachable = 0
    inodes_note = 'inode use on app machines below 95%'
    inodes_high = 0
    writable_note = 'app machines can write to disk'
    not_writable = 0
    #disk_note = 'disk use on app machines below 95%'
    #disk_high = 0
    #memory_note = 'memory use on app machines below 95%'
    #memory_high = 0
    es_note = 'indexes stable'
    es_unreachable = 0
    indexable_note = 'index accepts index/delete operations'
    cluster_note = 'cluster stable'

    for addr in app.config.get('APP_MACHINES_INTERNAL_IPS', []):
        if not addr.startswith('http'): addr = 'http://' + addr
        addr += url_for('.stats')
        try:
            # timeout so that one dead app machine cannot hang the whole status check
            r = requests.get(addr, timeout=5)
            status_code = r.status_code
            res['ping']['apps'][addr] = status_code if status_code != 200 else r.json()
        except Exception:
            status_code = None
            res['ping']['apps'][addr] = 'unreachable'
        try:
            if res['ping']['apps'][addr].get('inode_used_pc', 0) >= 95:
                inodes_high += 1
                inodes_note = 'INODE GREATER THAN 95% ON ' + str(inodes_high) + ' APP MACHINES'
            if res['ping']['apps'][addr].get('writable', False) != True:
                not_writable += 1
                writable_note = 'WRITE FAILURE ON ' + str(not_writable) + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('disk_used_pc',0) >= 95:
            #    disk_high += 1
            #    disk_note = 'DISK USE GREATER THAN 95% ON ' + str(disk_high) + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('memory_used_pc',0) >= 95:
            #    memory_high += 1
            #    memory_note = 'MEMORY USE GREATER THAN 95% ON ' + str(memory_high) + ' APP MACHINES'
        except:
            pass
        if status_code != 200:
            res['stable'] = False
            app_unreachable += 1
            app_note = str(app_unreachable) + ' APPS UNREACHABLE'
    res['notes'].append(app_note)
    res['notes'].append(inodes_note)
    res['notes'].append(writable_note)
    #res['notes'].append(disk_note)
    #res['notes'].append(memory_note)
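
    # Illustrative shape of the ping map at this point (addresses and values made up):
    #   res['ping']['apps'] == {
    #       'http://10.0.0.5/status/stats': {'inode_used_pc': 12, 'writable': True, 'host': 'app-1'},
    #       'http://10.0.0.6/status/stats': 'unreachable',
    #   }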

    # check that all necessary ES nodes can actually be pinged from this machine
    for eddr in app.config['ELASTICSEARCH_HOSTS']:
        es_addr = f'http://{eddr["host"]}:{eddr["port"]}'
        try:
            r = requests.get(es_addr, timeout=3)
            res['ping']['indices'][es_addr] = r.status_code
            if r.status_code != 200:
                # raise so that this node is counted as unreachable; stability is only
                # ever downgraded here, never reset to True
                raise Exception('ES is not OK - status is {}'.format(r.status_code))
        except Exception:
            res['stable'] = False
            es_unreachable += 1
            es_note = str(es_unreachable) + ' INDEXES UNREACHABLE'
    res['notes'].append(es_note)

    # query ES for index stats and cluster health (uses the last ES host pinged above)
    try:
        es = requests.get(es_addr + '/_stats').json()
        res['index'] = {'cluster': {}, 'shards': {'total': es['_shards']['total'], 'successful': es['_shards']['successful']}, 'indices': {}}
        for k, v in es['indices'].items():
            # primary store size reported in whole MB, rounded up
            res['index']['indices'][k] = {'docs': v['primaries']['docs']['count'], 'size': int(math.ceil(v['primaries']['store']['size_in_bytes'] / 1024 / 1024))}
        try:
            ces = requests.get(es_addr + '/_cluster/health')
            res['index']['cluster'] = ces.json()
            if res['index']['cluster']['status'] != 'green':
                # anything other than green marks the whole system unstable
                res['stable'] = False
                cluster_note = 'CLUSTER UNSTABLE'
        except:
            res['stable'] = False
            cluster_note = 'CLUSTER UNSTABLE'
    except:
        res['stable'] = False
        cluster_note = 'CLUSTER UNSTABLE'
    res['notes'].append(cluster_note)
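    # The /_cluster/health document stored above includes (among other fields) 'status'
    # ('green', 'yellow' or 'red'), 'number_of_nodes' and 'unassigned_shards', so
    # monitoring can inspect res['index']['cluster'] directly for more detail.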

    if False:  # flip this to True (or make it a setting) if happy to test writing to the index
        if res['stable']:
            try:
                ts = str(int(time.time()))
                test_index = 'status_test_writable_' + ts
                test_type = 'test_' + ts
                test_id = ts
                rp = requests.put(es_addr + '/' + test_index + '/' + test_type + '/' + test_id, json={'hello': 'world'})
                if rp.status_code != 201:
                    indexable_note = 'NEW INDEX WRITE OPERATION FAILED, RETURNED ' + str(rp.status_code)
                else:
                    try:
                        rr = requests.get(es_addr + '/' + test_index + '/' + test_type + '/' + test_id).json()
                        # the document GET API wraps the stored fields in _source
                        if rr.get('_source', {}).get('hello') != 'world':
                            indexable_note = 'INDEX READ DID NOT FIND EXPECTED VALUE IN NEW WRITTEN RECORD'
                        try:
                            rd = requests.delete(es_addr + '/' + test_index)
                            if rd.status_code != 200:
                                indexable_note = 'INDEX DELETE OF TEST INDEX RETURNED UNEXPECTED STATUS CODE OF ' + str(rd.status_code)
                            try:
                                rg = requests.get(es_addr + '/' + test_index)
                                if rg.status_code != 404:
                                    indexable_note = 'INDEX READ AFTER DELETE TEST RETURNED UNEXPECTED STATUS CODE OF ' + str(rg.status_code)
                            except:
                                pass
                        except:
                            indexable_note = 'INDEX DELETE OF TEST INDEX FAILED'
                    except:
                        indexable_note = 'INDEX READ OF NEW WRITTEN RECORD DID NOT SUCCEED'
            except:
                indexable_note = 'INDEX/DELETE OPERATIONS CAUSED EXCEPTION'
        else:
            indexable_note = 'INDEX/DELETE OPERATIONS NOT TESTED DUE TO SYSTEM ALREADY UNSTABLE'
    res['notes'].append(indexable_note)
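    # Note: the disabled write test above uses the old <index>/<type>/<id> document URL;
    # mapping types are deprecated in Elasticsearch 7 and removed in 8, so the path would
    # likely need to become <index>/_doc/<id> before this block is re-enabled.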

    # check background jobs
    try:
        # journal_csv should run at half past every hour on the main queue, so a completed
        # run in the last 2 hours confirms the main queue is working
        qcsv = {"query": {"bool": {"must": [
            {"term": {"status": "complete"}},
            {"term": {"action": "journal_csv"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 7200))}}}
        ]}}, "size": 1, "sort": {"created_date": {"order": "desc"}}}
        rcsv = models.BackgroundJob.send_query(qcsv)['hits']['hits'][0]['_source']  # IndexError if there are no hits
        res['background']['info'].append('journal_csv has run in the last 2 hours, confirming main queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job journal_csv in the last 2 hours - could be a problem with this job or with the main queue')
        res['stable'] = False

    try:
        # prune_es_backups should run at 9.30am every day on the long running queue, so a
        # completed run in the last 24.5 hours confirms the long running queue is working
        qprune = {"query": {"bool": {"must": [
            {"term": {"status": "complete"}},
            {"term": {"action": "prune_es_backups"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 88200))}}}
        ]}}, "size": 1, "sort": {"created_date": {"order": "desc"}}}
        rprune = models.BackgroundJob.send_query(qprune)['hits']['hits'][0]['_source']  # IndexError if there are no hits
        res['background']['info'].append('prune_es_backups has run in the last 24.5 hours, confirming long running queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job prune_es_backups in the last 24.5 hours - could be a problem with this job or with the long running queue')
        res['stable'] = False
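
    # For reference, each hit consumed above is a BackgroundJob document whose _source
    # includes at least the fields queried on, along the lines of (values made up):
    #   {"status": "complete", "action": "journal_csv", "created_date": "2022-07-19T16:30:00Z", ...}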

    # try: #fixme: commented out by SE - this isn't working well, it should probably be a background task itself
    #     # remove old jobs if there are too many - remove anything over six months old and complete
    #     old_seconds = app.config.get("STATUS_OLD_REMOVE_SECONDS", 15552000)
    #     qbg = {"query": {"bool": {"must": [
    #         {"term": {"status": "complete"}},
    #         {"range": {"created_date": {"lte": dates.format(dates.before(datetime.utcnow(), old_seconds))}}}
    #     ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}
    #     rbg = models.BackgroundJob.send_query(qbg)
    #     for job in rbg.get('hits', {}).get('hits', []):
    #         models.BackgroundJob.remove_by_id(job['fields']['id'][0])
    #     res['background']['info'].append('Removed {0} old complete background jobs'.format(rbg.get('hits', {}).get('total', 0)))
    # except:
    #     res['background']['status'] = 'Unstable'
    #     res['background']['info'].append('Error when trying to remove old background jobs')
    #     res['stable'] = False

    try:
        # alert about errors in the last ten minutes - assuming uptime robot (or similar) checks this page every ten minutes
        error_seconds = app.config.get("STATUS_ERROR_CHECK_SECONDS", 600)
        error_ignore = app.config.get("STATUS_ERROR_IGNORE", [])  # a list of strings which, if found in a job, mean it should be ignored
        error_ignore = [error_ignore] if isinstance(error_ignore, str) else error_ignore
        error_ignore_fields = app.config.get("STATUS_ERROR_IGNORE_FIELDS_TO_CHECK", False)  # which fields to retrieve in the query and check for the strings above
        error_ignore_fields = [error_ignore_fields] if isinstance(error_ignore_fields, str) else error_ignore_fields
        error_means_unstable = app.config.get("STATUS_ERROR_MEANS_UNSTABLE", True)
        qer = {"query": {"bool": {"must": [
            {"term": {"status": "error"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), error_seconds))}}}
        ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}}  # could be customised with a fields list if we only want to check certain fields for ignore strings
        if error_ignore_fields != False:
            qer["fields"] = error_ignore_fields
        rer = models.BackgroundJob.send_query(qer)
        error_count = 0
        for job in rer.get('hits', {}).get('hits', []):
            countable = True
            jsj = json.dumps(job)
            for ig in error_ignore:
                if ig in jsj:
                    countable = False
                    break
            if countable:
                error_count += 1
        if error_count != 0:
            res['background']['status'] = 'Unstable'
            res['background']['info'].append('Background jobs are causing errors')
            if error_means_unstable:
                # background errors only mark the whole system unstable if configured to
                res['stable'] = False
            emsg = 'Found {0} background jobs in error status in the last {1} seconds'.format(error_count, error_seconds)
            if len(error_ignore) != 0:
                emsg += '. Ignoring ' + ', '.join(error_ignore) + ' which reduced the error count from ' + str(rer.get('hits', {}).get('total', {}).get('value', 0))
            res['background']['info'].append(emsg)
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background jobs for errors')
        res['stable'] = False

    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp


#{"query": {"bool": {"must": [{"term":{"status":"complete"}}]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}
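
# A minimal sketch of exercising these routes by hand (assuming the blueprint is
# registered under a /status prefix, which happens outside this module):
#   curl -s https://example.org/status/        -> full stability report
#   curl -s https://example.org/status/stats   -> per-machine stats for this host
# The top-level report contains the 'stable', 'ping', 'background' and 'notes' keys.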