Commit e59cd88

Author: teem0n

Attempt to unify timeouts

1 parent: 94ed1ba

3 files changed: 27 additions & 31 deletions


internal/app/app.go

Lines changed: 13 additions & 15 deletions
```diff
@@ -221,7 +221,7 @@ func (app *App) healthChecker(ctx context.Context) {
 
 // separate gorutine performing info file management
 func (app *App) stateFileHandler(ctx context.Context) {
-	ticker := time.NewTicker(app.config.InfoFileHandlerInterval)
+	ticker := time.NewTicker(app.config.MedTimeout)
 	for {
 		select {
 		case <-ticker.C:
@@ -741,7 +741,7 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
 		}
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
+	managerElectionDelay := app.config.MedTimeout
 
 	if workingHANodesCount > 0 && visibleHAHostsCount <= (workingHANodesCount-1)/2 {
 		app.logger.Infof("manager lost quorum (%d/%d visible HAHosts)", visibleHAHostsCount, workingHANodesCount)
@@ -750,13 +750,13 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
 		if app.lostQuorumTime.IsZero() {
 			app.lostQuorumTime = time.Now()
 		} else {
-			// Lost quorum less than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
+			// Lost quorum less than 30 (default MedTimeout) seconds ago
 			// Just wait manager recover connection
-			if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss {
+			if lostQuorumDuration <= managerElectionDelay {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager wait for network recovery", lostQuorumDuration.Seconds())
-			// Lost quorum more than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
+			// Lost quorum more than 30 (default MedTimeout) seconds ago
 			// Manager should release lock and dont acquire lock for 45 (default ManagerElectionDelayAfterQuorumLoss) seconds
-			} else if lostQuorumDuration > managerElectionDelayAfterQuorumLoss {
+			} else {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager release lock", lostQuorumDuration.Seconds())
 				app.dcs.ReleaseLock(pathManagerLock)
 				return stateCandidate, false
@@ -775,29 +775,27 @@ func (app *App) AcquireLock(path string) bool {
 		return app.dcs.AcquireLock(path)
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
-	managerLockAcquireDelayAfterQuorumLoss := app.config.ManagerLockAcquireDelayAfterQuorumLoss
+	electionDelay := app.config.MedTimeout
+	lockAcquireDelay := electionDelay + 15*time.Second
 
 	lostQuorumDuration := time.Since(app.lostQuorumTime)
-	if lostQuorumDuration < managerElectionDelayAfterQuorumLoss {
+	if lostQuorumDuration < electionDelay {
 		app.logger.Debug("manager try to acquire lock")
 		return app.dcs.AcquireLock(path)
-	} else if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss+managerLockAcquireDelayAfterQuorumLoss {
+	} else if lostQuorumDuration <= electionDelay+lockAcquireDelay {
 		// Manager cant AcquireLock in delay
 		app.logger.Debugf(
 			"Quorum loss ongoing (%0.2fs): manager lock acquisition blocked (%0.2fs/%0.2fs cooldown)",
 			lostQuorumDuration.Seconds(),
-			lostQuorumDuration.Seconds()-managerElectionDelayAfterQuorumLoss.Seconds(),
-			managerLockAcquireDelayAfterQuorumLoss.Seconds(),
+			lostQuorumDuration.Seconds()-electionDelay.Seconds(),
+			lockAcquireDelay.Seconds(),
		)
 		return false
 	// Manager start to try to AcquireLock
-	} else if lostQuorumDuration > app.config.ManagerElectionDelayAfterQuorumLoss+app.config.ManagerLockAcquireDelayAfterQuorumLoss {
+	} else {
 		app.lostQuorumTime = time.Time{}
 		return app.dcs.AcquireLock(path)
 	}
-
-	return false
 }
 
 func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*nodestate.NodeState, activeNodes []string, master string) error {
```
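
The net effect in `AcquireLock` is a three-phase timeline derived from the single `MedTimeout` knob (30s by default): keep trying for the first 30s after quorum loss, stay blocked until 30s + 45s = 75s, then reset the quorum-loss timer and try again. A self-contained sketch of that decision logic (the `decideAcquire` function and its string labels are hypothetical, not the repo's code):

```go
package main

import (
	"fmt"
	"time"
)

// medTimeout mirrors the default config.MedTimeout introduced by this commit.
const medTimeout = 30 * time.Second

// decideAcquire mirrors the branch structure of App.AcquireLock above.
func decideAcquire(lostQuorumFor time.Duration) string {
	electionDelay := medTimeout                        // 30s
	lockAcquireDelay := electionDelay + 15*time.Second // 45s
	switch {
	case lostQuorumFor < electionDelay:
		return "try to acquire lock" // first 30s after quorum loss
	case lostQuorumFor <= electionDelay+lockAcquireDelay:
		return "acquisition blocked" // cooldown window, 30s..75s
	default:
		return "reset quorum-loss timer and acquire" // past 75s
	}
}

func main() {
	for _, d := range []time.Duration{10 * time.Second, 60 * time.Second, 90 * time.Second} {
		fmt.Printf("%v after quorum loss: %s\n", d, decideAcquire(d))
	}
}
```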

internal/config/config.go

Lines changed: 10 additions & 12 deletions
```diff
@@ -34,6 +34,15 @@ type MySQLConfig struct {
 
 // Config contains all mysync configuration
 type Config struct {
+	// We need only few timeout settings:
+	// - Fast (read sqls, 5 sec),
+	// - Meduim (modifying sqls, start-stop, etc, 30 sec)
+	// - perhaps, Long (?)
+	// Let's start with meduim...
+
+	// 30 sec timeout - sql, etc
+	MedTimeout time.Duration `config:"m_timeout" yaml:"m_timeout"`
+
 	DevMode bool `config:"dev_mode" yaml:"dev_mode"`
 	SemiSync bool `config:"semi_sync" yaml:"semi_sync"`
 	SemiSyncEnableLag int64 `config:"semi_sync_enable_lag" yaml:"semi_sync_enable_lag"`
@@ -58,16 +67,10 @@ type Config struct {
 	DcsWaitTimeout time.Duration `config:"dcs_wait_timeout" yaml:"dcs_wait_timeout"`
 	DBTimeout time.Duration `config:"db_timeout" yaml:"db_timeout"`
 	DBLostCheckTimeout time.Duration `config:"db_lost_check_timeout" yaml:"db_lost_check_timeout"`
-	DBSetRoTimeout time.Duration `config:"db_set_ro_timeout" yaml:"db_set_ro_timeout"`
-	DBSetRoForceTimeout time.Duration `config:"db_set_ro_force_timeout" yaml:"db_set_ro_force_timeout"`
-	DBStopSlaveSQLThreadTimeout time.Duration `config:"db_stop_slave_sql_thread_timeout" yaml:"db_stop_slave_sql_thread_timeout"`
 	TickInterval time.Duration `config:"tick_interval" yaml:"tick_interval"`
 	HealthCheckInterval time.Duration `config:"healthcheck_interval" yaml:"healthcheck_interval"`
-	InfoFileHandlerInterval time.Duration `config:"info_file_handler_interval" yaml:"info_file_handler_interval"`
 	RecoveryCheckInterval time.Duration `config:"recoverycheck_interval" yaml:"recoverycheck_interval"`
 	ExternalCAFileCheckInterval time.Duration `config:"external_ca_file_check_interval" yaml:"external_ca_file_check_interval"`
-	ManagerElectionDelayAfterQuorumLoss time.Duration `config:"manager_election_delay_after_quorum_loss" yaml:"manager_election_delay_after_quorum_loss"`
-	ManagerLockAcquireDelayAfterQuorumLoss time.Duration `config:"manager_lock_acquire_delay_after_quorum_loss" yaml:"manager_lock_acquire_delay_after_quorum_loss"`
 	MaxAcceptableLag float64 `config:"max_acceptable_lag" yaml:"max_acceptable_lag"`
 	SlaveCatchUpTimeout time.Duration `config:"slave_catch_up_timeout" yaml:"slave_catch_up_timeout"`
 	DisableSemiSyncReplicationOnMaintenance bool `config:"disable_semi_sync_replication_on_maintenance" yaml:"disable_semi_sync_replication_on_maintenance"`
@@ -123,6 +126,7 @@ func DefaultConfig() (Config, error) {
 		return Config{}, err
 	}
 	config := Config{
+		MedTimeout: 30 * time.Second,
 		DevMode: false,
 		SemiSync: false,
 		SemiSyncEnableLag: 100 * 1024 * 1024, // 100Mb
@@ -156,18 +160,12 @@
 		DcsWaitTimeout: 10 * time.Second,
 		DBTimeout: 5 * time.Second,
 		DBLostCheckTimeout: 5 * time.Second,
-		DBSetRoTimeout: 30 * time.Second,
-		DBSetRoForceTimeout: 30 * time.Second,
 		DisableSetReadonlyOnLost: false,
 		ResetupCrashedHosts: false,
-		DBStopSlaveSQLThreadTimeout: 30 * time.Second,
 		TickInterval: 5 * time.Second,
 		HealthCheckInterval: 5 * time.Second,
-		InfoFileHandlerInterval: 30 * time.Second,
 		RecoveryCheckInterval: 5 * time.Second,
 		ExternalCAFileCheckInterval: 5 * time.Second,
-		ManagerElectionDelayAfterQuorumLoss: 30 * time.Second, // need more than 15 sec
-		ManagerLockAcquireDelayAfterQuorumLoss: 45 * time.Second,
 		MaxAcceptableLag: 60.0,
 		SlaveCatchUpTimeout: 30 * time.Minute,
 		DisableSemiSyncReplicationOnMaintenance: true,
```
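
With the struct and defaults above, one `m_timeout` key stands in for five removed 30-second settings, and the old 45-second `manager_lock_acquire_delay_after_quorum_loss` is now derived (`MedTimeout + 15s`) rather than configured. A standalone sketch of the mapping, using only values visible in this diff:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Default introduced by this commit (config.MedTimeout, key "m_timeout").
	medTimeout := 30 * time.Second

	// All removed 30s knobs collapse into the single m_timeout value.
	removed := []string{
		"db_set_ro_timeout",
		"db_set_ro_force_timeout",
		"db_stop_slave_sql_thread_timeout",
		"info_file_handler_interval",
		"manager_election_delay_after_quorum_loss",
	}
	for _, key := range removed {
		fmt.Printf("%-42s -> m_timeout (%v)\n", key, medTimeout)
	}

	// manager_lock_acquire_delay_after_quorum_loss (was 45s) is now derived.
	fmt.Println("lock acquire delay:", medTimeout+15*time.Second) // 45s
}
```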

internal/mysql/node.go

Lines changed: 4 additions & 4 deletions
```diff
@@ -713,7 +713,7 @@ func (n *Node) IsReadOnly() (bool, bool, error) {
 // Setting server read-only may take a while
 // as server waits all running commits (not transactions) to be finished
 func (n *Node) SetReadOnly(superReadOnly bool) error {
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.MedTimeout)
 }
 
 func (n *Node) setReadonlyWithTimeout(superReadOnly bool, timeout time.Duration) error {
@@ -780,7 +780,7 @@ func (n *Node) SetReadOnlyWithForce(excludeUsers []string, superReadOnly bool) e
 
 	defer func() { quit <- true }()
 
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoForceTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.MedTimeout)
 }
 
 // SetWritable sets MySQL Node to be writable, eg. disables read-only
@@ -796,7 +796,7 @@ func (n *Node) StopSlave() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.MedTimeout)
 }
 
 // StartSlave starts replication (both IO and SQL threads)
@@ -858,7 +858,7 @@ func (n *Node) StopSlaveSQLThread() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.MedTimeout)
 }
 
 // StartSlaveSQLThread starts SQL replication thread
```
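
The internals of `setReadonlyWithTimeout` and `execMogrifyWithTimeout` are not part of this diff; the usual Go pattern they suggest is bounding the statement with `context.WithTimeout`. A minimal sketch under that assumption, standalone with a plain `database/sql` handle instead of the repo's `Node` (the DSN and driver choice are placeholders):

```go
package main

import (
	"context"
	"database/sql"
	"time"

	_ "github.com/go-sql-driver/mysql" // driver assumed; mysync manages MySQL
)

// medTimeout mirrors config.MedTimeout's 30s default.
const medTimeout = 30 * time.Second

// setReadOnly bounds SET GLOBAL super_read_only with the unified timeout,
// the same budget the diff now applies to SetReadOnly and StopSlaveSQLThread.
func setReadOnly(db *sql.DB) error {
	ctx, cancel := context.WithTimeout(context.Background(), medTimeout)
	defer cancel()
	_, err := db.ExecContext(ctx, "SET GLOBAL super_read_only = 1")
	return err
}

func main() {
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/")
	if err != nil {
		panic(err)
	}
	defer db.Close()
	if err := setReadOnly(db); err != nil {
		panic(err)
	}
}
```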
