Skip to content

Commit 0b86caf

Browse files
committed
fix failure count
1 parent 13be83b commit 0b86caf

1 file changed

Lines changed: 24 additions & 20 deletions

File tree

rpchealth.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,6 @@ async def update_health_status():
213213
health_status = initial_health_status
214214
block_number = initial_block_number
215215

216-
if health_status == 200:
217-
server_data["failure_count"][key] = 0 # reset failure count
218-
else:
219-
server_data["failure_count"][key] = server_data["failure_count"].get(key, 0) + 1
220-
health_reason = "RPC health check"
221-
222216
# Process block number if valid
223217
if block_number is not None:
224218
if key not in server_data['stale_count']:
@@ -243,19 +237,41 @@ async def update_health_status():
243237

244238
except (ConnectionRefusedError, TimeoutError, RPCError, OSError) as e:
245239
logger.error(f"Attempt {attempt + 1} failed for {rpc_address}: {e}")
246-
server_data["failure_count"][key] = server_data["failure_count"].get(key, 0) + 1
247240
if attempt == MAX_RETRIES - 1:
248241
health_status = 503
249242
health_reason = f"Failed after {MAX_RETRIES} retries"
250243
else:
251244
await asyncio.sleep(RETRY_DELAY)
252245
except Exception as e:
253246
logger.exception(f"Unexpected error checking health status of {rpc_address}: {e}")
254-
server_data["failure_count"][key] = server_data["failure_count"].get(key, 0) + 1
255247
health_status = 503
256248
health_reason = f"Unexpected error: {str(e)}"
257249
break # Exit retry loop for unexpected exceptions
258250

251+
# Final check: Block difference against other servers
252+
# KEY FIX: Only update health status for block difference if difference is significant
253+
if health_status == 200: # Only check block difference if otherwise healthy
254+
valid_blocks = [b for b in server_data.get('last_block', {}).values() if b is not None]
255+
if valid_blocks: # Only proceed if there are valid block numbers
256+
max_block = max(valid_blocks)
257+
current_server_block = server_data['last_block'].get(key) # Get current server's block
258+
259+
# NOTE: the threshold is still 50 blocks; raise it (e.g. to 100) if servers are being flagged too eagerly
260+
# The comparison is strict (> 50), so a server exactly 50 blocks behind the highest known block stays healthy
261+
if current_server_block is not None and max_block - current_server_block > 50:
262+
health_status = 503 # Mark as unhealthy
263+
health_reason = f"Block difference (behind by {max_block - current_server_block})"
264+
# KEY FIX: Only increment failure count for large block differences
265+
server_data['failure_count'][key] = server_data["failure_count"].get(key, 0) + 1
266+
267+
# KEY FIX: Update failure count based on final health status
268+
if health_status == 200:
269+
server_data["failure_count"][key] = 0 # reset failure count on success
270+
else:
271+
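# Only increment if the failure count wasn't already incremented above for a block difference.
# NOTE(review): max_block and current_server_block appear to be assigned only inside the
# `health_status == 200` branch above; if the failure came from an exception path they may be
# unbound here (UnboundLocalError) or hold stale values from an earlier server - confirm and
# consider tracking this with a boolean flag instead of comparing reason strings.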
272+
if health_reason != f"Block difference (behind by {max_block - current_server_block})":
273+
server_data['failure_count'][key] = server_data["failure_count"].get(key, 0) + 1
274+
259275
# Check for server removal due to failures
260276
if server_data["failure_count"].get(key, 0) >= REMOVE_AFTER_FAILURES:
261277
logger.info(f"Removing server {key} due to persistent failures.")
@@ -271,18 +287,6 @@ async def update_health_status():
271287
await save_server_data(server_data)
272288
continue # Skip the rest for this server
273289

274-
# Final check: Block difference against other servers
275-
if health_status == 200: # Only check block difference if otherwise healthy
276-
valid_blocks = [b for b in server_data.get('last_block', {}).values() if b is not None]
277-
if valid_blocks: # Only proceed if there are valid block numbers
278-
max_block = max(valid_blocks)
279-
current_server_block = server_data['last_block'].get(key) # Get current server's block
280-
281-
if current_server_block is not None and max_block - current_server_block > 50:
282-
health_status = 503 # Mark as unhealthy
283-
health_reason = f"Block difference (behind by {max_block - current_server_block})"
284-
server_data['failure_count'][key] = server_data["failure_count"].get(key, 0) + 1 # increment failure
285-
286290
# Now that all health checks are complete, update the status and notify if changed
287291
if health_status != old_status:
288292
server_data['health_status'][key] = health_status

0 commit comments

Comments
 (0)