Commit e59cd88

Author: teem0n

Attempt to unify timeouts

1 parent: 94ed1ba

3 files changed: 27 additions & 31 deletions


internal/app/app.go

Lines changed: 13 additions & 15 deletions
```diff
@@ -221,7 +221,7 @@ func (app *App) healthChecker(ctx context.Context) {
 
 // separate gorutine performing info file management
 func (app *App) stateFileHandler(ctx context.Context) {
-	ticker := time.NewTicker(app.config.InfoFileHandlerInterval)
+	ticker := time.NewTicker(app.config.MedTimeout)
 	for {
 		select {
 		case <-ticker.C:
@@ -741,7 +741,7 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
 		}
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
+	managerElectionDelay := app.config.MedTimeout
 
 	if workingHANodesCount > 0 && visibleHAHostsCount <= (workingHANodesCount-1)/2 {
 		app.logger.Infof("manager lost quorum (%d/%d visible HAHosts)", visibleHAHostsCount, workingHANodesCount)
@@ -750,13 +750,13 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
 		if app.lostQuorumTime.IsZero() {
 			app.lostQuorumTime = time.Now()
 		} else {
-			// Lost quorum less than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
+			// Lost quorum less than 30 (default MedTimeout) seconds ago
 			// Just wait manager recover connection
-			if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss {
+			if lostQuorumDuration <= managerElectionDelay {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager wait for network recovery", lostQuorumDuration.Seconds())
-			// Lost quorum more than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
+			// Lost quorum more than 30 (default MedTimeout) seconds ago
 			// Manager should release lock and dont acquire lock for 45 (default ManagerElectionDelayAfterQuorumLoss) seconds
-			} else if lostQuorumDuration > managerElectionDelayAfterQuorumLoss {
+			} else {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager release lock", lostQuorumDuration.Seconds())
 				app.dcs.ReleaseLock(pathManagerLock)
 				return stateCandidate, false
@@ -775,29 +775,27 @@ func (app *App) AcquireLock(path string) bool {
 		return app.dcs.AcquireLock(path)
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
-	managerLockAcquireDelayAfterQuorumLoss := app.config.ManagerLockAcquireDelayAfterQuorumLoss
+	electionDelay := app.config.MedTimeout
+	lockAcquireDelay := electionDelay + 15*time.Second
 
 	lostQuorumDuration := time.Since(app.lostQuorumTime)
-	if lostQuorumDuration < managerElectionDelayAfterQuorumLoss {
+	if lostQuorumDuration < electionDelay {
 		app.logger.Debug("manager try to acquire lock")
 		return app.dcs.AcquireLock(path)
-	} else if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss+managerLockAcquireDelayAfterQuorumLoss {
+	} else if lostQuorumDuration <= electionDelay+lockAcquireDelay {
 		// Manager cant AcquireLock in delay
 		app.logger.Debugf(
 			"Quorum loss ongoing (%0.2fs): manager lock acquisition blocked (%0.2fs/%0.2fs cooldown)",
 			lostQuorumDuration.Seconds(),
-			lostQuorumDuration.Seconds()-managerElectionDelayAfterQuorumLoss.Seconds(),
-			managerLockAcquireDelayAfterQuorumLoss.Seconds(),
+			lostQuorumDuration.Seconds()-electionDelay.Seconds(),
+			lockAcquireDelay.Seconds(),
		)
 		return false
 	// Manager start to try to AcquireLock
-	} else if lostQuorumDuration > app.config.ManagerElectionDelayAfterQuorumLoss+app.config.ManagerLockAcquireDelayAfterQuorumLoss {
+	} else {
 		app.lostQuorumTime = time.Time{}
 		return app.dcs.AcquireLock(path)
 	}
-
-	return false
 }
 
 func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*nodestate.NodeState, activeNodes []string, master string) error {
```
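
The net effect in `AcquireLock` is a three-phase timeline derived from the single `MedTimeout` knob (30s by default): keep trying for the first 30s after quorum loss, stay blocked until 30s + 45s = 75s, then reset the quorum-loss timer and try again. A self-contained sketch of that decision logic (the `decideAcquire` function and its string labels are hypothetical, not the repo's code):

```go
package main

import (
	"fmt"
	"time"
)

// medTimeout mirrors the default config.MedTimeout introduced by this commit.
const medTimeout = 30 * time.Second

// decideAcquire mirrors the branch structure of App.AcquireLock above.
func decideAcquire(lostQuorumFor time.Duration) string {
	electionDelay := medTimeout                        // 30s
	lockAcquireDelay := electionDelay + 15*time.Second // 45s
	switch {
	case lostQuorumFor < electionDelay:
		return "try to acquire lock" // first 30s after quorum loss
	case lostQuorumFor <= electionDelay+lockAcquireDelay:
		return "acquisition blocked" // cooldown window, 30s..75s
	default:
		return "reset quorum-loss timer and acquire" // past 75s
	}
}

func main() {
	for _, d := range []time.Duration{10 * time.Second, 60 * time.Second, 90 * time.Second} {
		fmt.Printf("%v after quorum loss: %s\n", d, decideAcquire(d))
	}
}
```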

internal/config/config.go

Lines changed: 10 additions & 12 deletions
```diff
@@ -34,6 +34,15 @@ type MySQLConfig struct {
 
 // Config contains all mysync configuration
 type Config struct {
+	// We need only few timeout settings:
+	// - Fast (read sqls, 5 sec),
+	// - Meduim (modifying sqls, start-stop, etc, 30 sec)
+	// - perhaps, Long (?)
+	// Let's start with meduim...
+
+	// 30 sec timeout - sql, etc
+	MedTimeout time.Duration `config:"m_timeout" yaml:"m_timeout"`
+
 	DevMode bool `config:"dev_mode" yaml:"dev_mode"`
 	SemiSync bool `config:"semi_sync" yaml:"semi_sync"`
 	SemiSyncEnableLag int64 `config:"semi_sync_enable_lag" yaml:"semi_sync_enable_lag"`
@@ -58,16 +67,10 @@ type Config struct {
 	DcsWaitTimeout time.Duration `config:"dcs_wait_timeout" yaml:"dcs_wait_timeout"`
 	DBTimeout time.Duration `config:"db_timeout" yaml:"db_timeout"`
 	DBLostCheckTimeout time.Duration `config:"db_lost_check_timeout" yaml:"db_lost_check_timeout"`
-	DBSetRoTimeout time.Duration `config:"db_set_ro_timeout" yaml:"db_set_ro_timeout"`
-	DBSetRoForceTimeout time.Duration `config:"db_set_ro_force_timeout" yaml:"db_set_ro_force_timeout"`
-	DBStopSlaveSQLThreadTimeout time.Duration `config:"db_stop_slave_sql_thread_timeout" yaml:"db_stop_slave_sql_thread_timeout"`
 	TickInterval time.Duration `config:"tick_interval" yaml:"tick_interval"`
 	HealthCheckInterval time.Duration `config:"healthcheck_interval" yaml:"healthcheck_interval"`
-	InfoFileHandlerInterval time.Duration `config:"info_file_handler_interval" yaml:"info_file_handler_interval"`
 	RecoveryCheckInterval time.Duration `config:"recoverycheck_interval" yaml:"recoverycheck_interval"`
 	ExternalCAFileCheckInterval time.Duration `config:"external_ca_file_check_interval" yaml:"external_ca_file_check_interval"`
-	ManagerElectionDelayAfterQuorumLoss time.Duration `config:"manager_election_delay_after_quorum_loss" yaml:"manager_election_delay_after_quorum_loss"`
-	ManagerLockAcquireDelayAfterQuorumLoss time.Duration `config:"manager_lock_acquire_delay_after_quorum_loss" yaml:"manager_lock_acquire_delay_after_quorum_loss"`
 	MaxAcceptableLag float64 `config:"max_acceptable_lag" yaml:"max_acceptable_lag"`
 	SlaveCatchUpTimeout time.Duration `config:"slave_catch_up_timeout" yaml:"slave_catch_up_timeout"`
 	DisableSemiSyncReplicationOnMaintenance bool `config:"disable_semi_sync_replication_on_maintenance" yaml:"disable_semi_sync_replication_on_maintenance"`
@@ -123,6 +126,7 @@ func DefaultConfig() (Config, error) {
 		return Config{}, err
 	}
 	config := Config{
+		MedTimeout: 30 * time.Second,
 		DevMode: false,
 		SemiSync: false,
 		SemiSyncEnableLag: 100 * 1024 * 1024, // 100Mb
@@ -156,18 +160,12 @@
 		DcsWaitTimeout: 10 * time.Second,
 		DBTimeout: 5 * time.Second,
 		DBLostCheckTimeout: 5 * time.Second,
-		DBSetRoTimeout: 30 * time.Second,
-		DBSetRoForceTimeout: 30 * time.Second,
 		DisableSetReadonlyOnLost: false,
 		ResetupCrashedHosts: false,
-		DBStopSlaveSQLThreadTimeout: 30 * time.Second,
 		TickInterval: 5 * time.Second,
 		HealthCheckInterval: 5 * time.Second,
-		InfoFileHandlerInterval: 30 * time.Second,
 		RecoveryCheckInterval: 5 * time.Second,
 		ExternalCAFileCheckInterval: 5 * time.Second,
-		ManagerElectionDelayAfterQuorumLoss: 30 * time.Second, // need more than 15 sec
-		ManagerLockAcquireDelayAfterQuorumLoss: 45 * time.Second,
 		MaxAcceptableLag: 60.0,
 		SlaveCatchUpTimeout: 30 * time.Minute,
 		DisableSemiSyncReplicationOnMaintenance: true,
```
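
With the struct and defaults above, one `m_timeout` key stands in for five removed 30-second settings, and the old 45-second `manager_lock_acquire_delay_after_quorum_loss` is now derived (`MedTimeout + 15s`) rather than configured. A standalone sketch of the mapping, using only values visible in this diff:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Default introduced by this commit (config.MedTimeout, key "m_timeout").
	medTimeout := 30 * time.Second

	// All removed 30s knobs collapse into the single m_timeout value.
	removed := []string{
		"db_set_ro_timeout",
		"db_set_ro_force_timeout",
		"db_stop_slave_sql_thread_timeout",
		"info_file_handler_interval",
		"manager_election_delay_after_quorum_loss",
	}
	for _, key := range removed {
		fmt.Printf("%-42s -> m_timeout (%v)\n", key, medTimeout)
	}

	// manager_lock_acquire_delay_after_quorum_loss (was 45s) is now derived.
	fmt.Println("lock acquire delay:", medTimeout+15*time.Second) // 45s
}
```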

internal/mysql/node.go

Lines changed: 4 additions & 4 deletions
```diff
@@ -713,7 +713,7 @@ func (n *Node) IsReadOnly() (bool, bool, error) {
 // Setting server read-only may take a while
 // as server waits all running commits (not transactions) to be finished
 func (n *Node) SetReadOnly(superReadOnly bool) error {
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.MedTimeout)
 }
 
 func (n *Node) setReadonlyWithTimeout(superReadOnly bool, timeout time.Duration) error {
@@ -780,7 +780,7 @@ func (n *Node) SetReadOnlyWithForce(excludeUsers []string, superReadOnly bool) e
 
 	defer func() { quit <- true }()
 
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoForceTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.MedTimeout)
 }
 
 // SetWritable sets MySQL Node to be writable, eg. disables read-only
@@ -796,7 +796,7 @@ func (n *Node) StopSlave() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.MedTimeout)
 }
 
 // StartSlave starts replication (both IO and SQL threads)
@@ -858,7 +858,7 @@ func (n *Node) StopSlaveSQLThread() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.MedTimeout)
 }
 
 // StartSlaveSQLThread starts SQL replication thread
```
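
The internals of `setReadonlyWithTimeout` and `execMogrifyWithTimeout` are not part of this diff; the usual Go pattern they suggest is bounding the statement with `context.WithTimeout`. A minimal sketch under that assumption, standalone with a plain `database/sql` handle instead of the repo's `Node` (the DSN and driver choice are placeholders):

```go
package main

import (
	"context"
	"database/sql"
	"time"

	_ "github.com/go-sql-driver/mysql" // driver assumed; mysync manages MySQL
)

// medTimeout mirrors config.MedTimeout's 30s default.
const medTimeout = 30 * time.Second

// setReadOnly bounds SET GLOBAL super_read_only with the unified timeout,
// the same budget the diff now applies to SetReadOnly and StopSlaveSQLThread.
func setReadOnly(db *sql.DB) error {
	ctx, cancel := context.WithTimeout(context.Background(), medTimeout)
	defer cancel()
	_, err := db.ExecContext(ctx, "SET GLOBAL super_read_only = 1")
	return err
}

func main() {
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/")
	if err != nil {
		panic(err)
	}
	defer db.Close()
	if err := setReadOnly(db); err != nil {
		panic(err)
	}
}
```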
