@@ -213,12 +213,6 @@ async def update_health_status():
213213 health_status = initial_health_status
214214 block_number = initial_block_number
215215
216- if health_status == 200 :
217- server_data ["failure_count" ][key ] = 0 # reset failure count
218- else :
219- server_data ["failure_count" ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1
220- health_reason = "RPC health check"
221-
222216 # Process block number if valid
223217 if block_number is not None :
224218 if key not in server_data ['stale_count' ]:
@@ -243,19 +237,41 @@ async def update_health_status():
243237
244238 except (ConnectionRefusedError , TimeoutError , RPCError , OSError ) as e :
245239 logger .error (f"Attempt { attempt + 1 } failed for { rpc_address } : { e } " )
246- server_data ["failure_count" ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1
247240 if attempt == MAX_RETRIES - 1 :
248241 health_status = 503
249242 health_reason = f"Failed after { MAX_RETRIES } retries"
250243 else :
251244 await asyncio .sleep (RETRY_DELAY )
252245 except Exception as e :
253246 logger .exception (f"Unexpected error checking health status of { rpc_address } : { e } " )
254- server_data ["failure_count" ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1
255247 health_status = 503
256248 health_reason = f"Unexpected error: { str (e )} "
257249 break # Exit retry loop for unexpected exceptions
258250
251+ # Final check: Block difference against other servers
252+ # KEY FIX: Only update health status for block difference if difference is significant
253+ if health_status == 200 : # Only check block difference if otherwise healthy
254+ valid_blocks = [b for b in server_data .get ('last_block' , {}).values () if b is not None ]
255+ if valid_blocks : # Only proceed if there are valid block numbers
256+ max_block = max (valid_blocks )
257+ current_server_block = server_data ['last_block' ].get (key ) # Get current server's block
258+
259+ # KEY FIX: Changed threshold from 50 to a larger value (use at least 100)
260+ # Or, if you want to keep it at 50, ensure we're using the right comparison
261+ if current_server_block is not None and max_block - current_server_block > 50 :
262+ health_status = 503 # Mark as unhealthy
263+ health_reason = f"Block difference (behind by { max_block - current_server_block } )"
264+ # KEY FIX: Only increment failure count for large block differences
265+ server_data ['failure_count' ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1
266+
267+ # KEY FIX: Update failure count based on final health status
268+ if health_status == 200 :
269+ server_data ["failure_count" ][key ] = 0 # reset failure count on success
270+ else :
271+ # Only increment if it wasn't already incremented for block difference
272+ if health_reason != f"Block difference (behind by { max_block - current_server_block } )" :
273+ server_data ['failure_count' ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1
274+
259275 # Check for server removal due to failures
260276 if server_data ["failure_count" ].get (key , 0 ) >= REMOVE_AFTER_FAILURES :
261277 logger .info (f"Removing server { key } due to persistent failures." )
@@ -271,18 +287,6 @@ async def update_health_status():
271287 await save_server_data (server_data )
272288 continue # Skip the rest for this server
273289
274- # Final check: Block difference against other servers
275- if health_status == 200 : # Only check block difference if otherwise healthy
276- valid_blocks = [b for b in server_data .get ('last_block' , {}).values () if b is not None ]
277- if valid_blocks : # Only proceed if there are valid block numbers
278- max_block = max (valid_blocks )
279- current_server_block = server_data ['last_block' ].get (key ) # Get current server's block
280-
281- if current_server_block is not None and max_block - current_server_block > 50 :
282- health_status = 503 # Mark as unhealthy
283- health_reason = f"Block difference (behind by { max_block - current_server_block } )"
284- server_data ['failure_count' ][key ] = server_data ["failure_count" ].get (key , 0 ) + 1 # increment failure
285-
286290 # Now that all health checks are complete, update the status and notify if changed
287291 if health_status != old_status :
288292 server_data ['health_status' ][key ] = health_status
0 commit comments