Coverage for portality/view/status.py: 9%

156 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from flask import Blueprint, make_response, url_for 

2from portality import util 

3from portality.core import app 

4from portality import models 

5from portality.lib import dates 

6import json, requests, math, os, time 

7from datetime import datetime 

8 

# Blueprint grouping the service-status/monitoring endpoints defined below.
blueprint = Blueprint('status', __name__)

10 

11 

@blueprint.route('/stats')
@util.jsonp
def stats():
    """Report basic health stats for this app machine as a JSON response.

    The payload may contain:
      - inode_used_pc: percentage of inodes in use on the root filesystem
      - writable: whether a scratch file could be written under /tmp
      - host: this machine's hostname
    A key is omitted when the corresponding check could not be performed.
    """
    res = {}

    # Get inode use on the root filesystem
    try:
        st = os.statvfs('/')
        res['inode_used_pc'] = int((float(st.f_files - st.f_ffree) / st.f_files) * 100)
        # could complete this by installing and using psutil but as disk and memory can currently
        # be monitored directly by DO, no current need - can change if we move from DO
        #res['disk_used_pc'] = int((float(st.f_blocks-st.f_bavail)/st.f_blocks)*100)
        #res['memory_used_pc'] = 0
    except (AttributeError, OSError, ZeroDivisionError):
        # statvfs is POSIX-only, and f_files can legitimately be 0
        pass

    # Test writing to the filesystem; timestamped name avoids clashes between checks
    ts = int(time.time())
    fn = '/tmp/status_test_write_' + str(ts) + '.txt'
    try:
        # context manager guarantees the handle is closed even if the write fails
        with open(fn, "w") as f:
            f.write("I am a test at " + str(ts))
        res['writable'] = True
    except OSError:
        res['writable'] = False
    # best-effort cleanup of the scratch file
    try:
        os.remove(fn)
    except OSError:
        pass

    # Retrieve the hostname (uname is POSIX-only, hence the AttributeError guard)
    try:
        res['host'] = os.uname()[1]
    except (AttributeError, OSError):
        pass

    # Return a JSON response
    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp

54 

55 

@blueprint.route('/')
@util.jsonp
def status():
    """Aggregate health report for the whole service, returned as JSON.

    Checks, in order:
      1. each app machine's /status/stats endpoint (reachability, inode use, disk writes)
      2. each configured Elasticsearch host (ping, index stats, cluster health)
      3. background job health via recent BackgroundJob records

    ``res['stable']`` starts True and is only ever downgraded to False by a
    failing check, so a later healthy check can no longer mask an earlier
    failure (previously ping/cluster checks could reset it back to True).

    To get monitoring on this, use uptime robot or similar to check that the
    status page contains the 'stable': True string and the expected note strings.
    """
    res = {'stable': True, 'ping': {'apps': {}, 'indices': {}}, 'background': {'status': 'Background jobs are stable', 'info': []}, 'notes': []}

    # default note strings; replaced by upper-case warnings when a check fails
    app_note = 'apps reachable'
    app_unreachable = 0
    inodes_note = 'inode use on app machines below 95%'
    inodes_high = 0
    writable_note = 'app machines can write to disk'
    not_writable = 0
    #disk_note = 'disk use on app machines below 95%'
    #disk_high = 0
    #memory_note = 'memory use on app machines below 95%'
    #memory_high = 0
    es_note = 'indexes stable'
    es_unreachable = 0
    indexable_note = 'index accepts index/delete operations'
    cluster_note = 'cluster stable'

    # ping every app machine's own /stats endpoint and inspect the results
    for addr in app.config.get('APP_MACHINES_INTERNAL_IPS', []):
        if not addr.startswith('http'): addr = 'http://' + addr
        addr += url_for('.stats')
        r = requests.get(addr)
        # store the parsed stats dict on success, or the bare status code on failure
        res['ping']['apps'][addr] = r.json() if r.status_code == 200 else r.status_code
        machine_stats = res['ping']['apps'][addr]
        if isinstance(machine_stats, dict):  # only a successful ping yields a dict
            if machine_stats.get('inode_used_pc', 0) >= 95:
                inodes_high += 1
                inodes_note = 'INODE GREATER THAN 95% ON ' + str(inodes_high) + ' APP MACHINES'
            if machine_stats.get('writable', False) is not True:
                not_writable += 1
                writable_note = 'WRITE FAILURE ON ' + str(not_writable) + ' APP MACHINES'
            #if machine_stats.get('disk_used_pc', 0) >= 95:
            #    disk_high += 1
            #    disk_note = 'DISK USE GREATER THAN 95% ON ' + str(disk_high) + ' APP MACHINES'
            #if machine_stats.get('memory_used_pc', 0) >= 95:
            #    memory_high += 1
            #    memory_note = 'MEMORY USE GREATER THAN 95% ON ' + str(memory_high) + ' APP MACHINES'
        if r.status_code != 200:
            res['stable'] = False
            app_unreachable += 1
            app_note = str(app_unreachable) + ' APPS UNREACHABLE'
    res['notes'].append(app_note)
    res['notes'].append(inodes_note)
    res['notes'].append(writable_note)
    #res['notes'].append(disk_note)
    #res['notes'].append(memory_note)

    # check that all necessary ES nodes can actually be pinged from this machine
    es_addr = ''
    for eddr in app.config['ELASTICSEARCH_HOSTS']:
        es_addr = f'http://{eddr["host"]}:{eddr["port"]}'
        try:
            r = requests.get(es_addr, timeout=3)
            res['ping']['indices'][es_addr] = r.status_code
            if r.status_code != 200:
                raise Exception('ES is not OK - status is {}'.format(r.status_code))
        except Exception:
            # FIX: a healthy ping used to assign stable = True here, which could
            # mask an earlier app failure; we now only ever downgrade to False
            res['stable'] = False
            es_unreachable += 1
            es_note = str(es_unreachable) + ' INDEXES UNREACHABLE'
    res['notes'].append(es_note)

    # query ES for cluster health and nodes up (uses the last ES host pinged above)
    try:
        es = requests.get(es_addr + '/_stats').json()
        res['index'] = { 'cluster': {}, 'shards': { 'total': es['_shards']['total'], 'successful': es['_shards']['successful'] }, 'indices': {} }
        for k, v in es['indices'].items():
            res['index']['indices'][k] = { 'docs': v['primaries']['docs']['count'], 'size': int(math.ceil(v['primaries']['store']['size_in_bytes']) / 1024 / 1024) }
        try:
            ces = requests.get(es_addr + '/_cluster/health')
            res['index']['cluster'] = ces.json()
            if res['index']['cluster']['status'] != 'green':
                # FIX: a green cluster used to assign stable = True, masking earlier failures
                res['stable'] = False
                cluster_note = 'CLUSTER UNSTABLE'
        except Exception:
            res['stable'] = False
            cluster_note = 'CLUSTER UNSTABLE'
    except Exception:
        res['stable'] = False
        cluster_note = 'CLUSTER UNSTABLE'
    res['notes'].append(cluster_note)

    if False: # remove this False if happy to test write to the index (could be a setting)
        if res['stable'] and False:
            try:
                ts = str(int(time.time()))
                test_index = 'status_test_writable_' + ts
                test_type = 'test_' + ts
                test_id = ts
                rp = requests.put(es_addr + '/' + test_index + '/' + test_type + '/' + test_id, json={'hello': 'world'})
                if rp.status_code != 201:
                    indexable_note = 'NEW INDEX WRITE OPERATION FAILED TO WRITE, RETURNED ' + str(rp.status_code)
                else:
                    try:
                        rr = requests.get(es_addr + '/' + test_index + '/' + test_type + '/' + test_id).json()
                        if rr['hello'] != 'world':
                            indexable_note = 'INDEX READ DID NOT FIND EXPECTED VALUE IN NEW WRITTEN RECORD'
                        try:
                            rd = requests.delete(es_addr + '/' + test_index)
                            if rd.status_code != 200:
                                indexable_note = 'INDEX DELETE OF TEST INDEX DID NOT RETURNED UNEXPECTED STATUS CODE OF ' + str(rd.status_code)
                            try:
                                rg = requests.get(es_addr + '/' + test_index)
                                if rg.status_code != 404:
                                    indexable_note = 'INDEX READ AFTER DELETE TEST RETURNED UNEXPECTED STATUS CODE OF ' + str(rg.status_code)
                            except Exception:
                                pass
                        except Exception:
                            indexable_note = 'INDEX DELETE OF TEST INDEX FAILED'
                    except Exception:
                        indexable_note = 'INDEX READ OF NEW WRITTEN RECORD DID NOT SUCCEED'
            except Exception:
                indexable_note = 'INDEX/DELETE OPERATIONS CAUSED EXCEPTION'
        else:
            indexable_note = 'INDEX/DELETE OPERATIONS NOT TESTED DUE TO SYSTEM ALREADY UNSTABLE'
        res['notes'].append(indexable_note)

    # check background jobs
    try:
        # check if journal_csv, which should run at half past every hour on the main queue,
        # has completed in the last 2 hours (which confirms main queue)
        qcsv = {"query": {"bool": {"must": [
            {"term":{"status":"complete"}},
            {"term":{"action":"journal_csv"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 7200))}}}
        ]}}, "size": 1, "sort": {"created_date": {"order": "desc"}}}
        # indexing [0] raises IndexError when there is no matching hit - that is the check
        models.BackgroundJob.send_query(qcsv)['hits']['hits'][0]['_source']
        res['background']['info'].append('journal_csv has run in the last 2 hours, confirming main queue is running')
    except Exception:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job journal_csv in the last 2 hours - could be a problem with this job or with main queue')
        res['stable'] = False
    try:
        # check if prune_es_backups, which should run at 9.30am every day, has completed
        # in the last 24.5 hours (which confirms long running queue)
        qprune = {"query": {"bool": {"must": [
            {"term": {"status": "complete"}},
            {"term": {"action": "prune_es_backups"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 88200))}}}
        ]}}, "size": 1, "sort": {"created_date": {"order": "desc"}}}
        models.BackgroundJob.send_query(qprune)['hits']['hits'][0]['_source']
        res['background']['info'].append('prune_es_backups has run in the last 24.5 hours, confirming long running queue is running')
    except Exception:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job prune_es_backups in the last 24 hours - could be a problem with this job or with long running queue')
        res['stable'] = False
    # try: #fixme: commented out by SE - this isn't working well, it should probably be a background task itself
    #     # remove old jobs if there are too many - remove anything over six months and complete
    #     old_seconds = app.config.get("STATUS_OLD_REMOVE_SECONDS", 15552000)
    #     qbg = {"query": {"bool": {"must": [
    #         {"term": {"status": "complete"}},
    #         {"range": {"created_date": {"lte": dates.format(dates.before(datetime.utcnow(), old_seconds))}}}
    #     ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}
    #     rbg = models.BackgroundJob.send_query(qbg)
    #     for job in rbg.get('hits', {}).get('hits', []):
    #         models.BackgroundJob.remove_by_id(job['fields']['id'][0])
    #     res['background']['info'].append('Removed {0} old complete background jobs'.format(rbg.get('hits', {}).get('total', 0)))
    # except:
    #     res['background']['status'] = 'Unstable'
    #     res['background']['info'].append('Error when trying to remove old background jobs')
    #     res['stable'] = False
    try:
        # alert about errors in the last ten minutes - assuming we are going to use
        # uptimerobot to check this every ten minutes
        error_seconds = app.config.get("STATUS_ERROR_CHECK_SECONDS", 600)
        error_ignore = app.config.get("STATUS_ERROR_IGNORE", [])  # configure a list of strings that denote something to ignore
        error_ignore = [error_ignore] if isinstance(error_ignore, str) else error_ignore
        error_ignore_fields = app.config.get("STATUS_ERROR_IGNORE_FIELDS_TO_CHECK", False)  # which fields to get in the query, to check for the strings provided above
        error_ignore_fields = [error_ignore_fields] if isinstance(error_ignore_fields, str) else error_ignore_fields
        error_means_unstable = app.config.get("STATUS_ERROR_MEANS_UNSTABLE", True)
        qer = {"query": {"bool": {"must": [
            {"term": {"status": "error"}},
            {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), error_seconds))}}}
        ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}}  # could be customised with a fields list if we only want to check certain fields for ignore types
        if error_ignore_fields is not False:
            qer["fields"] = error_ignore_fields
        rer = models.BackgroundJob.send_query(qer)
        error_count = 0
        for job in rer.get('hits', {}).get('hits', []):
            # a job only counts as an error if it mentions none of the ignore strings
            jsj = json.dumps(job)
            if not any(ig in jsj for ig in error_ignore):
                error_count += 1
        if error_count != 0:
            res['background']['status'] = 'Unstable'
            res['background']['info'].append('Background jobs are causing errors')
            # FIX: was `res['stable'] = error_means_unstable`, which set stable to
            # True on errors under the default config; only downgrade when the flag is on
            if error_means_unstable:
                res['stable'] = False
            emsg = 'Found {0} background jobs in error status in the last {1} seconds'.format(error_count, error_seconds)
            if len(error_ignore) != 0:
                emsg += '. Ignoring ' + ', '.join(error_ignore) + ' which reduced the error count from ' + str(rer.get('hits', {}).get('total', {}).get('value', 0))
            res['background']['info'].append(emsg)
    except Exception:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background jobs for errors')
        res['stable'] = False

    # Return a JSON response
    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp

262 

263#{"query": {"bool": {"must": [{"term":{"status":"complete"}}]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}