From 57df930c43d129d4ebda2f4ec0ba097094a49c2b Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Wed, 17 Jun 2026 22:18:16 +0800 Subject: [PATCH 1/5] coordinator: persist maintainer epochs before ownership changes --- coordinator/changefeed/changefeed.go | 55 +- .../changefeed/changefeed_db_backend.go | 18 +- coordinator/changefeed/changefeed_test.go | 37 +- coordinator/changefeed/etcd_backend.go | 168 ++-- coordinator/changefeed/etcd_backend_test.go | 236 +++-- .../changefeed/mock/changefeed_db_backend.go | 30 +- coordinator/controller.go | 220 +++-- coordinator/controller_drain_test.go | 2 +- coordinator/controller_test.go | 360 +++++++- coordinator/coordinator.go | 23 +- coordinator/coordinator_test.go | 130 +++ coordinator/create_changefeed_gc_test.go | 1 + coordinator/operator/operator_add.go | 1 + coordinator/operator/operator_add_test.go | 35 +- coordinator/operator/operator_controller.go | 245 +++++- .../operator/operator_controller_test.go | 401 ++++++++- coordinator/operator/operator_move.go | 214 +++-- coordinator/operator/operator_move_test.go | 191 +++- coordinator/operator/operator_stop.go | 49 +- coordinator/operator/operator_stop_test.go | 8 +- coordinator/scheduler/balance_test.go | 10 +- coordinator/scheduler/basic_test.go | 4 +- coordinator/scheduler/drain_test.go | 12 +- heartbeatpb/heartbeat.pb.go | 817 +++++++++++++----- heartbeatpb/heartbeat.proto | 13 + pkg/common/format.go | 5 +- pkg/common/maintainer_epoch.go | 23 + pkg/pdutil/utils.go | 11 + pkg/pdutil/utils_test.go | 16 + 29 files changed, 2731 insertions(+), 604 deletions(-) create mode 100644 pkg/common/maintainer_epoch.go diff --git a/coordinator/changefeed/changefeed.go b/coordinator/changefeed/changefeed.go index 54d32e7f52..42fd6c1f35 100644 --- a/coordinator/changefeed/changefeed.go +++ b/coordinator/changefeed/changefeed.go @@ -14,7 +14,6 @@ package changefeed import ( - "encoding/json" "net/url" "sync" @@ -41,7 +40,6 @@ type Changefeed struct { nodeIDMu 
sync.Mutex nodeID node.ID - configBytes []byte // it's saved to the backend db lastSavedCheckpointTs *atomic.Uint64 logCoordinatorResolvedTs *atomic.Uint64 @@ -62,16 +60,9 @@ func NewChangefeed(cfID common.ChangeFeedID, log.Panic("unable to parse sink-uri", zap.String("url", info.SinkURI), zap.Error(err)) } - bytes, err := json.Marshal(info) - if err != nil { - log.Panic("unable to marshal changefeed config", - zap.Error(err)) - } - res := &Changefeed{ ID: cfID, info: atomic.NewPointer(info), - configBytes: bytes, lastSavedCheckpointTs: atomic.NewUint64(checkpointTs), logCoordinatorResolvedTs: atomic.NewUint64(checkpointTs), sinkType: getSinkType(uri.Scheme), @@ -243,10 +234,11 @@ func (c *Changefeed) GetStatusForResume() *heartbeatpb.MaintainerStatus { } clone := &heartbeatpb.MaintainerStatus{ - CheckpointTs: status.CheckpointTs, - FeedState: status.FeedState, - State: status.State, - // we don't clone the errors from status, because the old error is meaningless for the resume action, but only blocks. + CheckpointTs: status.CheckpointTs, + FeedState: status.FeedState, + State: status.State, + MaintainerEpoch: status.MaintainerEpoch, + // Old errors are meaningless for resume and can only block the resumed task. 
Err: []*heartbeatpb.RunningError{}, } @@ -272,21 +264,26 @@ func (c *Changefeed) GetLastSavedCheckPointTs() uint64 { } func (c *Changefeed) NewAddMaintainerMessage(server node.ID) *messaging.TargetMessage { + info := c.GetInfo() + if info == nil { + log.Panic("changefeed info is nil", zap.String("changefeedID", c.ID.String())) + } + configData, err := info.MarshalWithTruncation(false) + if err != nil { + log.Panic("unable to marshal changefeed config", zap.Error(err)) + } return messaging.NewSingleTargetMessage(server, messaging.MaintainerManagerTopic, &heartbeatpb.AddMaintainerRequest{ Id: c.ID.ToPB(), CheckpointTs: c.GetStatus().CheckpointTs, - Config: c.configBytes, + Config: []byte(configData), IsNewChangefeed: c.isNew, - KeyspaceId: c.GetKeyspaceID(), + KeyspaceId: info.KeyspaceID, + MaintainerEpoch: info.Epoch, }) } -func (c *Changefeed) NewRemoveMaintainerMessage(server node.ID, casCade, removed bool) *messaging.TargetMessage { - return RemoveMaintainerMessage(c.GetKeyspaceID(), c.ID, server, casCade, removed) -} - func (c *Changefeed) NewCheckpointTsMessage(ts uint64) *messaging.TargetMessage { return messaging.NewSingleTargetMessage(c.GetNodeID(), messaging.MaintainerManagerTopic, @@ -296,15 +293,25 @@ func (c *Changefeed) NewCheckpointTsMessage(ts uint64) *messaging.TargetMessage }) } -func RemoveMaintainerMessage(keyspaceID uint32, id common.ChangeFeedID, server node.ID, casCade bool, removed bool) *messaging.TargetMessage { +// RemoveMaintainerMessage builds the fenced remove request sent to a maintainer owner. +// The maintainer epoch identifies the owner generation that is allowed to stop. 
+func RemoveMaintainerMessage( + keyspaceID uint32, + id common.ChangeFeedID, + server node.ID, + casCade bool, + removed bool, + maintainerEpoch uint64, +) *messaging.TargetMessage { casCade = casCade || removed return messaging.NewSingleTargetMessage(server, messaging.MaintainerManagerTopic, &heartbeatpb.RemoveMaintainerRequest{ - Id: id.ToPB(), - Cascade: casCade, - Removed: removed, - KeyspaceId: keyspaceID, + Id: id.ToPB(), + Cascade: casCade, + Removed: removed, + KeyspaceId: keyspaceID, + MaintainerEpoch: maintainerEpoch, }) } diff --git a/coordinator/changefeed/changefeed_db_backend.go b/coordinator/changefeed/changefeed_db_backend.go index 1eacd755e3..dcfd253e23 100644 --- a/coordinator/changefeed/changefeed_db_backend.go +++ b/coordinator/changefeed/changefeed_db_backend.go @@ -20,6 +20,19 @@ import ( "github.com/pingcap/ticdc/pkg/config" ) +// EpochBumpOptions carries metadata persisted together with a changefeed epoch bump. +type EpochBumpOptions struct { + CheckpointTs uint64 + Progress config.Progress + // UpdateStatus controls whether CheckpointTs and Progress overwrite the + // persisted status read by the bump transaction. + UpdateStatus bool + State *config.FeedState + Error *config.RunningError + // UpdateError controls whether Error overwrites the persisted runtime error. + UpdateError bool +} + // Backend is the metastore for the changefeed type Backend interface { // GetAllChangefeeds returns all changefeeds from the backend db, include stopped and failed changefeeds @@ -30,14 +43,15 @@ type Backend interface { CreateChangefeed(ctx context.Context, info *config.ChangeFeedInfo) error // UpdateChangefeed updates changefeed info to db UpdateChangefeed(ctx context.Context, info *config.ChangeFeedInfo, checkpointTs uint64, progress config.Progress) error + // BumpChangefeedEpoch persists a strictly newer epoch using the latest stored + // ChangeFeedInfo. It only reads and updates stored status when UpdateStatus is set. 
+ BumpChangefeedEpoch(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options EpochBumpOptions) (*config.ChangeFeedInfo, error) // PauseChangefeed persists the pause status to db for a changefeed PauseChangefeed(ctx context.Context, id common.ChangeFeedID) error // DeleteChangefeed removes all related info of a changefeed from db DeleteChangefeed(ctx context.Context, id common.ChangeFeedID) error // SetChangefeedProgress persists the operation progress status to db for a changefeed SetChangefeedProgress(ctx context.Context, id common.ChangeFeedID, progress config.Progress) error - // ResumeChangefeed persists the resumed status to db for a changefeed and returns the resumed info. - ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, newCheckpointTs uint64) (*config.ChangeFeedInfo, error) // UpdateChangefeedCheckpointTs persists the checkpointTs for changefeeds UpdateChangefeedCheckpointTs(ctx context.Context, checkpointTs map[common.ChangeFeedID]uint64) error } diff --git a/coordinator/changefeed/changefeed_test.go b/coordinator/changefeed/changefeed_test.go index 3fdbb1148b..dc8227e2c2 100644 --- a/coordinator/changefeed/changefeed_test.go +++ b/coordinator/changefeed/changefeed_test.go @@ -14,6 +14,7 @@ package changefeed import ( + "encoding/json" "testing" "github.com/pingcap/ticdc/heartbeatpb" @@ -227,28 +228,22 @@ func TestChangefeed_NewAddMaintainerMessage(t *testing.T) { SinkURI: "kafka://127.0.0.1:9092", State: config.StateNormal, Config: config.GetDefaultReplicaConfig(), + Epoch: 7, } + info.KeyspaceID = 123 cf := NewChangefeed(cfID, info, 100, true) server := node.ID("server-1") msg := cf.NewAddMaintainerMessage(server) require.Equal(t, server, msg.To) require.Equal(t, messaging.MaintainerManagerTopic, msg.Topic) -} - -func TestChangefeed_NewRemoveMaintainerMessage(t *testing.T) { - cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - info := &config.ChangeFeedInfo{ - SinkURI: 
"kafka://127.0.0.1:9092", - State: config.StateNormal, - Config: config.GetDefaultReplicaConfig(), - } - cf := NewChangefeed(cfID, info, 100, true) - - server := node.ID("server-1") - msg := cf.NewRemoveMaintainerMessage(server, true, true) - require.Equal(t, server, msg.To) - require.Equal(t, messaging.MaintainerManagerTopic, msg.Topic) + req := msg.Message[0].(*heartbeatpb.AddMaintainerRequest) + require.Equal(t, info.KeyspaceID, req.KeyspaceId) + require.Equal(t, info.Epoch, req.MaintainerEpoch) + configInfo := &config.ChangeFeedInfo{} + require.NoError(t, json.Unmarshal(req.Config, configInfo)) + require.Equal(t, info.Epoch, configInfo.Epoch) + require.Equal(t, info.SinkURI, configInfo.SinkURI) } func TestChangefeed_NewCheckpointTsMessage(t *testing.T) { @@ -269,9 +264,11 @@ func TestChangefeed_NewCheckpointTsMessage(t *testing.T) { func TestRemoveMaintainerMessage(t *testing.T) { cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) server := node.ID("server-1") - msg := RemoveMaintainerMessage(common.DefaultKeyspaceID, cfID, server, true, true) + msg := RemoveMaintainerMessage(common.DefaultKeyspaceID, cfID, server, true, true, 10) require.Equal(t, server, msg.To) require.Equal(t, messaging.MaintainerManagerTopic, msg.Topic) + req := msg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, uint64(10), req.MaintainerEpoch) } func TestChangefeedGetStatusForResume(t *testing.T) { @@ -283,9 +280,10 @@ func TestChangefeedGetStatusForResume(t *testing.T) { Name: "test-changefeed", Keyspace: "test-keyspace", }, - CheckpointTs: 789, - FeedState: "normal", - State: heartbeatpb.ComponentState_Working, + CheckpointTs: 789, + FeedState: "normal", + State: heartbeatpb.ComponentState_Working, + MaintainerEpoch: 42, Err: []*heartbeatpb.RunningError{ { Time: "2024-01-01 00:00:00", @@ -312,6 +310,7 @@ func TestChangefeedGetStatusForResume(t *testing.T) { require.Equal(t, originalStatus.CheckpointTs, clonedStatus.CheckpointTs) 
require.Equal(t, originalStatus.FeedState, clonedStatus.FeedState) require.Equal(t, originalStatus.State, clonedStatus.State) + require.Equal(t, originalStatus.MaintainerEpoch, clonedStatus.MaintainerEpoch) require.Equal(t, 0, len(clonedStatus.Err)) } diff --git a/coordinator/changefeed/etcd_backend.go b/coordinator/changefeed/etcd_backend.go index c6378a03a2..29064b2461 100644 --- a/coordinator/changefeed/etcd_backend.go +++ b/coordinator/changefeed/etcd_backend.go @@ -25,6 +25,7 @@ import ( "github.com/pingcap/ticdc/pkg/config" cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/etcd" + "github.com/pingcap/ticdc/pkg/pdutil" clientv3 "go.etcd.io/etcd/client/v3" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -207,6 +208,123 @@ func (b *EtcdBackend) UpdateChangefeed(ctx context.Context, info *config.ChangeF return nil } +// BumpChangefeedEpoch atomically persists a strictly newer ownership epoch. +// It can optionally update status in the same transaction so state changes and +// the new owner fence are observed together after coordinator failover. +func (b *EtcdBackend) BumpChangefeedEpoch( + ctx context.Context, + id common.ChangeFeedID, + candidateEpoch uint64, + options EpochBumpOptions, +) (*config.ChangeFeedInfo, error) { + // The epoch bump must be serialized at the persisted metadata boundary. + // Otherwise independent in-memory bumps can generate the same epoch or + // overwrite a newer epoch written by another coordinator. 
+ const ( + bumpEpochMaxRetries = 10 + bumpEpochRetryDelay = 25 * time.Millisecond + ) + infoKey := etcd.GetEtcdKeyChangeFeedInfo(b.etcdClient.GetClusterID(), id.DisplayName) + + for range bumpEpochMaxRetries { + infoResp, err := b.etcdClient.GetEtcdClient().Get(ctx, infoKey) + if err != nil { + return nil, errors.Trace(err) + } + if len(infoResp.Kvs) == 0 { + return nil, errors.Trace(cerror.ErrChangeFeedNotExists.GenWithStackByArgs(id.Name())) + } + + info := &config.ChangeFeedInfo{} + if err := info.Unmarshal(infoResp.Kvs[0].Value); err != nil { + return nil, errors.Trace(err) + } + if info.ChangefeedID.Name() == "" { + info.ChangefeedID = id + } + // Keep compatibility defaults when the bumped info replaces the + // coordinator's in-memory copy after an upgrade. + info.VerifyAndComplete() + epoch, err := pdutil.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) + if err != nil { + return nil, errors.Trace(err) + } + info.Epoch = epoch + if options.State != nil { + info.State = *options.State + } + if options.UpdateError { + info.Error = options.Error + } + infoValue, err := info.Marshal() + if err != nil { + return nil, errors.Trace(err) + } + + if !options.UpdateStatus { + putResp, err := b.etcdClient.GetEtcdClient().Txn(ctx, + []clientv3.Cmp{ + clientv3.Compare(clientv3.ModRevision(infoKey), "=", infoResp.Kvs[0].ModRevision), + }, + []clientv3.Op{ + clientv3.OpPut(infoKey, infoValue), + }, + []clientv3.Op{}) + if err != nil { + return nil, errors.Trace(err) + } + if putResp.Succeeded { + return info, nil + } + + select { + case <-ctx.Done(): + return nil, errors.Trace(ctx.Err()) + case <-time.After(bumpEpochRetryDelay): + } + continue + } + + jobKey := etcd.GetEtcdKeyJob(b.etcdClient.GetClusterID(), id.DisplayName) + status, statusModRevision, err := b.etcdClient.GetChangeFeedStatus(ctx, id) + if err != nil { + return nil, errors.Trace(err) + } + status.CheckpointTs = options.CheckpointTs + status.Progress = options.Progress + statusValue, err := 
status.Marshal() + if err != nil { + return nil, errors.Trace(err) + } + + putResp, err := b.etcdClient.GetEtcdClient().Txn(ctx, + []clientv3.Cmp{ + clientv3.Compare(clientv3.ModRevision(infoKey), "=", infoResp.Kvs[0].ModRevision), + clientv3.Compare(clientv3.ModRevision(jobKey), "=", statusModRevision), + }, + []clientv3.Op{ + clientv3.OpPut(infoKey, infoValue), + clientv3.OpPut(jobKey, statusValue), + }, + []clientv3.Op{}) + if err != nil { + return nil, errors.Trace(err) + } + if putResp.Succeeded { + return info, nil + } + + select { + case <-ctx.Done(): + return nil, errors.Trace(ctx.Err()) + case <-time.After(bumpEpochRetryDelay): + } + } + + err := cerror.ErrMetaOpFailed.GenWithStackByArgs(fmt.Sprintf("bump changefeed epoch %s failed", id.Name())) + return nil, errors.Trace(err) +} + func (b *EtcdBackend) PauseChangefeed(ctx context.Context, id common.ChangeFeedID) error { info, err := b.etcdClient.GetChangeFeedInfo(ctx, id.DisplayName) if err != nil { @@ -263,56 +381,6 @@ func (b *EtcdBackend) DeleteChangefeed(ctx context.Context, return nil } -// ResumeChangefeed persists a resumed changefeed and returns the metadata used by the caller. -func (b *EtcdBackend) ResumeChangefeed(ctx context.Context, - id common.ChangeFeedID, newCheckpointTs uint64, -) (*config.ChangeFeedInfo, error) { - info, err := b.GetChangefeedInfo(ctx, id) - if err != nil { - return nil, errors.Trace(err) - } - // Legacy stopped changefeeds can contain sparse metadata that was completed - // during coordinator bootstrap. Complete it again before persisting the - // resumed state so backend-loaded metadata does not drop compatibility defaults. 
- if info.Config == nil { - info.Config = config.GetDefaultReplicaConfig() - } - info.VerifyAndComplete() - info.State = config.StateNormal - newStr, err := info.Marshal() - if err != nil { - return nil, errors.Trace(err) - } - infoKey := etcd.GetEtcdKeyChangeFeedInfo(b.etcdClient.GetClusterID(), id.DisplayName) - opsThen := []clientv3.Op{ - clientv3.OpPut(infoKey, newStr), - } - if newCheckpointTs > 0 { - status, _, err := b.etcdClient.GetChangeFeedStatus(ctx, id) - if err != nil { - return nil, errors.Trace(err) - } - status.CheckpointTs = newCheckpointTs - status.Progress = config.ProgressNone - jobValue, err := status.Marshal() - if err != nil { - return nil, errors.Trace(err) - } - jobKey := etcd.GetEtcdKeyJob(b.etcdClient.GetClusterID(), id.DisplayName) - opsThen = append(opsThen, clientv3.OpPut(jobKey, jobValue)) - } - - putResp, err := b.etcdClient.GetEtcdClient().Txn(ctx, nil, opsThen, []clientv3.Op{}) - if err != nil { - return nil, errors.Trace(err) - } - if !putResp.Succeeded { - err = cerror.ErrMetaOpFailed.GenWithStackByArgs(fmt.Sprintf("resume changefeed %s", info.ChangefeedID.Name())) - return nil, errors.Trace(err) - } - return info, nil -} - func (b *EtcdBackend) SetChangefeedProgress(ctx context.Context, id common.ChangeFeedID, progress config.Progress) error { // SetChangefeedProgress uses etcd ModRevision compare-and-swap (CAS) to avoid // overwriting a newer checkpointTs written by the checkpoint updater. 
diff --git a/coordinator/changefeed/etcd_backend_test.go b/coordinator/changefeed/etcd_backend_test.go index 3cab212d6f..dd90d4f38a 100644 --- a/coordinator/changefeed/etcd_backend_test.go +++ b/coordinator/changefeed/etcd_backend_test.go @@ -146,7 +146,7 @@ func TestUpdateChangefeed(t *testing.T) { require.Nil(t, backend.UpdateChangefeed(context.Background(), &config.ChangeFeedInfo{}, 2, config.ProgressStopping)) } -func TestPauseChangefeed(t *testing.T) { +func TestBumpChangefeedEpoch(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -157,18 +157,56 @@ func TestPauseChangefeed(t *testing.T) { backend := NewEtcdBackend(cdcClient) changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - info := &config.ChangeFeedInfo{State: config.StateNormal} - status := &config.ChangeFeedStatus{Progress: config.ProgressStopping} - - cdcClient.EXPECT().GetChangeFeedInfo(gomock.Any(), changefeedID.DisplayName).Return(info, nil).Times(1) - cdcClient.EXPECT().GetChangeFeedStatus(gomock.Any(), changefeedID).Return(status, int64(0), nil).Times(1) - etcdClient.EXPECT().Txn(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(&clientv3.TxnResponse{Succeeded: true}, nil).Times(1) - - err := backend.PauseChangefeed(context.Background(), changefeedID) - require.Nil(t, err) + regionThreshold := 20 + info := &config.ChangeFeedInfo{ + ChangefeedID: changefeedID, + Config: &config.ReplicaConfig{ + Scheduler: &config.ChangefeedSchedulerConfig{ + RegionThreshold: ®ionThreshold, + }, + }, + State: config.StateStopped, + Epoch: 8, + } + value, err := info.Marshal() + require.NoError(t, err) + infoKey := etcd.GetEtcdKeyChangeFeedInfo("test-cluster-id", changefeedID.DisplayName) + + etcdClient.EXPECT(). + Get(gomock.Any(), infoKey). + Return(&clientv3.GetResponse{ + Kvs: []*mvccpb.KeyValue{{ + Value: []byte(value), + ModRevision: 3, + }}, + }, nil). + Times(1) + etcdClient.EXPECT(). 
+ Txn(gomock.Any(), gomock.Len(1), NewFuncMatcher(func(i any) bool { + ops := i.([]clientv3.Op) + require.Len(t, ops, 1) + require.True(t, ops[0].IsPut()) + persistedInfo := &config.ChangeFeedInfo{} + require.NoError(t, persistedInfo.Unmarshal(ops[0].ValueBytes())) + require.NotNil(t, persistedInfo.Config.Scheduler.RegionCountPerSpan) + require.NotZero(t, *persistedInfo.Config.Scheduler.RegionCountPerSpan) + return true + }), gomock.Len(0)). + Return(&clientv3.TxnResponse{Succeeded: true}, nil). + Times(1) + + normalState := config.StateNormal + got, err := backend.BumpChangefeedEpoch(context.Background(), changefeedID, 7, EpochBumpOptions{ + State: &normalState, + }) + require.NoError(t, err) + require.Equal(t, uint64(9), got.Epoch) + require.Equal(t, config.StateNormal, got.State) + require.NotNil(t, got.Config.Scheduler.RegionCountPerSpan) + require.NotZero(t, *got.Config.Scheduler.RegionCountPerSpan) } -func TestDeleteChangefeed(t *testing.T) { +func TestBumpChangefeedEpochUpdatesStatus(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -179,25 +217,116 @@ func TestDeleteChangefeed(t *testing.T) { backend := NewEtcdBackend(cdcClient) changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + info := &config.ChangeFeedInfo{ + ChangefeedID: changefeedID, + Config: config.GetDefaultReplicaConfig(), + Epoch: 8, + } + value, err := info.Marshal() + require.NoError(t, err) + infoKey := etcd.GetEtcdKeyChangeFeedInfo("test-cluster-id", changefeedID.DisplayName) + persistedStatus := &config.ChangeFeedStatus{ + CheckpointTs: 200, + Progress: config.ProgressNone, + } - etcdClient.EXPECT().Txn(gomock.Any(), gomock.Any(), NewFuncMatcher(func(i interface{}) bool { - ops := i.([]clientv3.Op) - require.Len(t, ops, 2) - require.True(t, ops[0].IsDelete()) - require.True(t, ops[1].IsDelete()) - return true - }), gomock.Any()).Return(&clientv3.TxnResponse{Succeeded: true}, nil).Times(1) + etcdClient.EXPECT(). 
+ Get(gomock.Any(), infoKey). + Return(&clientv3.GetResponse{ + Kvs: []*mvccpb.KeyValue{{ + Value: []byte(value), + ModRevision: 3, + }}, + }, nil). + Times(1) + cdcClient.EXPECT(). + GetChangeFeedStatus(gomock.Any(), changefeedID). + Return(persistedStatus, int64(5), nil). + Times(1) + etcdClient.EXPECT(). + Txn(gomock.Any(), gomock.Len(2), NewFuncMatcher(func(i any) bool { + ops := i.([]clientv3.Op) + require.Len(t, ops, 2) + require.True(t, ops[0].IsPut()) + require.True(t, ops[1].IsPut()) + status := &config.ChangeFeedStatus{} + require.NoError(t, status.Unmarshal(ops[1].ValueBytes())) + require.Equal(t, uint64(300), status.CheckpointTs) + require.Equal(t, config.ProgressStopping, status.Progress) + return true + }), gomock.Len(0)). + Return(&clientv3.TxnResponse{Succeeded: true}, nil). + Times(1) + + got, err := backend.BumpChangefeedEpoch(context.Background(), changefeedID, 9, EpochBumpOptions{ + UpdateStatus: true, + CheckpointTs: 300, + Progress: config.ProgressStopping, + }) + require.NoError(t, err) + require.Equal(t, uint64(9), got.Epoch) +} - err := backend.DeleteChangefeed(context.Background(), changefeedID) - require.Nil(t, err) +func TestBumpChangefeedEpochRetriesOnCASConflict(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + cdcClient := etcd.NewMockCDCEtcdClient(ctrl) + etcdClient := etcd.NewMockClient(ctrl) + cdcClient.EXPECT().GetEtcdClient().Return(etcdClient).AnyTimes() + cdcClient.EXPECT().GetClusterID().Return("test-cluster-id").AnyTimes() + backend := NewEtcdBackend(cdcClient) + + changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + firstInfo := &config.ChangeFeedInfo{ + ChangefeedID: changefeedID, + Config: config.GetDefaultReplicaConfig(), + Epoch: 8, + } + firstValue, err := firstInfo.Marshal() + require.NoError(t, err) + secondInfo := &config.ChangeFeedInfo{ + ChangefeedID: changefeedID, + Config: config.GetDefaultReplicaConfig(), + Epoch: 9, + } + secondValue, err := 
secondInfo.Marshal() + require.NoError(t, err) + infoKey := etcd.GetEtcdKeyChangeFeedInfo("test-cluster-id", changefeedID.DisplayName) + + etcdClient.EXPECT(). + Get(gomock.Any(), infoKey). + Return(&clientv3.GetResponse{ + Kvs: []*mvccpb.KeyValue{{ + Value: []byte(firstValue), + ModRevision: 3, + }}, + }, nil). + Times(1) + etcdClient.EXPECT(). + Txn(gomock.Any(), gomock.Len(1), gomock.Len(1), gomock.Len(0)). + Return(&clientv3.TxnResponse{Succeeded: false}, nil). + Times(1) + etcdClient.EXPECT(). + Get(gomock.Any(), infoKey). + Return(&clientv3.GetResponse{ + Kvs: []*mvccpb.KeyValue{{ + Value: []byte(secondValue), + ModRevision: 4, + }}, + }, nil). + Times(1) + etcdClient.EXPECT(). + Txn(gomock.Any(), gomock.Len(1), gomock.Len(1), gomock.Len(0)). + Return(&clientv3.TxnResponse{Succeeded: true}, nil). + Times(1) + + got, err := backend.BumpChangefeedEpoch(context.Background(), changefeedID, 7, EpochBumpOptions{}) + require.NoError(t, err) + require.Equal(t, uint64(10), got.Epoch) } -func TestResumeChangefeed(t *testing.T) { - // Scenario: resuming a stopped changefeed persists the normal state and - // returns the metadata that was actually loaded from etcd. - // Steps: - // 1) Load legacy changefeed info without an embedded ChangefeedID. - // 2) Resume the changefeed and assert the returned info is normalized. 
+func TestPauseChangefeed(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -208,23 +337,18 @@ func TestResumeChangefeed(t *testing.T) { backend := NewEtcdBackend(cdcClient) changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - info := &config.ChangeFeedInfo{State: config.StateStopped} - status := &config.ChangeFeedStatus{CheckpointTs: 100} + info := &config.ChangeFeedInfo{State: config.StateNormal} + status := &config.ChangeFeedStatus{Progress: config.ProgressStopping} cdcClient.EXPECT().GetChangeFeedInfo(gomock.Any(), changefeedID.DisplayName).Return(info, nil).Times(1) cdcClient.EXPECT().GetChangeFeedStatus(gomock.Any(), changefeedID).Return(status, int64(0), nil).Times(1) etcdClient.EXPECT().Txn(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(&clientv3.TxnResponse{Succeeded: true}, nil).Times(1) - resumedInfo, err := backend.ResumeChangefeed(context.Background(), changefeedID, 200) + err := backend.PauseChangefeed(context.Background(), changefeedID) require.Nil(t, err) - require.Equal(t, config.StateNormal, resumedInfo.State) - require.Equal(t, changefeedID, resumedInfo.ChangefeedID) } -func TestResumeChangefeedCompletesLegacySchedulerDefaults(t *testing.T) { - // Scenario: an old owner persisted a stopped changefeed with only explicit scheduler fields. - // Steps: resume that sparse metadata, inspect the etcd put payload, and verify resume persists - // compatibility defaults such as RegionCountPerSpan before returning the resumed info. 
+func TestDeleteChangefeed(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -234,42 +358,18 @@ func TestResumeChangefeedCompletesLegacySchedulerDefaults(t *testing.T) { cdcClient.EXPECT().GetClusterID().Return("test-cluster-id").AnyTimes() backend := NewEtcdBackend(cdcClient) - changefeedID := common.NewChangeFeedIDWithName("test-scheduler-defaults", common.DefaultKeyspaceName) - enableTableAcrossNodes := false - regionThreshold := 20 - writeKeyThreshold := 10485760 - info := &config.ChangeFeedInfo{ - ChangefeedID: changefeedID, - Config: config.GetDefaultReplicaConfig(), - State: config.StateStopped, - SinkURI: "mysql://127.0.0.1:3306", - } - info.Config.Scheduler = &config.ChangefeedSchedulerConfig{ - EnableTableAcrossNodes: &enableTableAcrossNodes, - RegionThreshold: ®ionThreshold, - WriteKeyThreshold: &writeKeyThreshold, - } + changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - cdcClient.EXPECT().GetChangeFeedInfo(gomock.Any(), changefeedID.DisplayName).Return(info, nil).Times(1) etcdClient.EXPECT().Txn(gomock.Any(), gomock.Any(), NewFuncMatcher(func(i interface{}) bool { ops := i.([]clientv3.Op) - require.Len(t, ops, 1) - require.True(t, ops[0].IsPut()) - - persistedInfo := &config.ChangeFeedInfo{} - require.NoError(t, persistedInfo.Unmarshal(ops[0].ValueBytes())) - require.Equal(t, config.StateNormal, persistedInfo.State) - require.NotNil(t, persistedInfo.Config) - require.NotNil(t, persistedInfo.Config.Scheduler) - require.NotNil(t, persistedInfo.Config.Scheduler.RegionCountPerSpan) - require.Greater(t, *persistedInfo.Config.Scheduler.RegionCountPerSpan, 0) + require.Len(t, ops, 2) + require.True(t, ops[0].IsDelete()) + require.True(t, ops[1].IsDelete()) return true }), gomock.Any()).Return(&clientv3.TxnResponse{Succeeded: true}, nil).Times(1) - resumedInfo, err := backend.ResumeChangefeed(context.Background(), changefeedID, 0) - require.NoError(t, err) - require.NotNil(t, 
resumedInfo.Config.Scheduler.RegionCountPerSpan) - require.Greater(t, *resumedInfo.Config.Scheduler.RegionCountPerSpan, 0) + err := backend.DeleteChangefeed(context.Background(), changefeedID) + require.Nil(t, err) } func TestSetChangefeedProgress(t *testing.T) { @@ -350,16 +450,16 @@ func TestUpdateChangefeedCheckpointTs(t *testing.T) { } type FuncMarcher struct { - m func(interface{}) bool + m func(any) bool } -func NewFuncMatcher(m func(interface{}) bool) gomock.Matcher { +func NewFuncMatcher(m func(any) bool) gomock.Matcher { return &FuncMarcher{ m: m, } } -func (f *FuncMarcher) Matches(x interface{}) bool { +func (f *FuncMarcher) Matches(x any) bool { return f.m(x) } diff --git a/coordinator/changefeed/mock/changefeed_db_backend.go b/coordinator/changefeed/mock/changefeed_db_backend.go index 8db908a7b4..bd967e496d 100644 --- a/coordinator/changefeed/mock/changefeed_db_backend.go +++ b/coordinator/changefeed/mock/changefeed_db_backend.go @@ -37,6 +37,21 @@ func (m *MockBackend) EXPECT() *MockBackendMockRecorder { return m.recorder } +// BumpChangefeedEpoch mocks base method. +func (m *MockBackend) BumpChangefeedEpoch(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "BumpChangefeedEpoch", ctx, id, candidateEpoch, options) + ret0, _ := ret[0].(*config.ChangeFeedInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// BumpChangefeedEpoch indicates an expected call of BumpChangefeedEpoch. +func (mr *MockBackendMockRecorder) BumpChangefeedEpoch(ctx, id, candidateEpoch, options interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BumpChangefeedEpoch", reflect.TypeOf((*MockBackend)(nil).BumpChangefeedEpoch), ctx, id, candidateEpoch, options) +} + // CreateChangefeed mocks base method. 
func (m *MockBackend) CreateChangefeed(ctx context.Context, info *config.ChangeFeedInfo) error { m.ctrl.T.Helper() @@ -109,21 +124,6 @@ func (mr *MockBackendMockRecorder) PauseChangefeed(ctx, id interface{}) *gomock. return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PauseChangefeed", reflect.TypeOf((*MockBackend)(nil).PauseChangefeed), ctx, id) } -// ResumeChangefeed mocks base method. -func (m *MockBackend) ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, newCheckpointTs uint64) (*config.ChangeFeedInfo, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ResumeChangefeed", ctx, id, newCheckpointTs) - ret0, _ := ret[0].(*config.ChangeFeedInfo) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// ResumeChangefeed indicates an expected call of ResumeChangefeed. -func (mr *MockBackendMockRecorder) ResumeChangefeed(ctx, id, newCheckpointTs interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResumeChangefeed", reflect.TypeOf((*MockBackend)(nil).ResumeChangefeed), ctx, id, newCheckpointTs) -} - // SetChangefeedProgress mocks base method. 
func (m *MockBackend) SetChangefeedProgress(ctx context.Context, id common.ChangeFeedID, progress config.Progress) error { m.ctrl.T.Helper() diff --git a/coordinator/controller.go b/coordinator/controller.go index 16ca6bf66a..a9750698d0 100644 --- a/coordinator/controller.go +++ b/coordinator/controller.go @@ -143,7 +143,7 @@ func NewController( ) *Controller { changefeedDB := changefeed.NewChangefeedDB(version) - oc := operator.NewOperatorController(selfNode, changefeedDB, backend, batchSize) + oc := operator.NewOperatorController(selfNode, changefeedDB, backend, pdClient, batchSize) messageCenter := appcontext.GetService[messaging.MessageCenter](appcontext.MessageCenter) drainController := drain.NewController(messageCenter) c := &Controller{ @@ -211,7 +211,7 @@ func NewController( for _, req := range requests { err := c.messageCenter.SendCommand(req) if err != nil { - log.Warn("send request failed when boostrapping initial node, will be resent later", + log.Warn("send request failed when bootstrapping initial node, will be resent later", zap.Any("targetNode", req.To), zap.Error(err)) } } @@ -488,7 +488,7 @@ func (c *Controller) onNodeChanged(ctx context.Context) { for _, req := range requests { err := c.messageCenter.SendCommand(req) if err != nil { - log.Warn("send request failed when boostrapping newly added node, will be resent later", + log.Warn("send request failed when bootstrapping newly added node, will be resent later", zap.Any("targetNode", req.To), zap.Error(err)) } } @@ -524,21 +524,17 @@ func (c *Controller) handleBootstrapResponses(ctx context.Context, responses map } log.Info("all new nodes bootstrap response received", zap.Int("newNodeCount", len(responses))) - // runningCfs are changefeeds that already running on other nodes - runningCfs := make(map[common.ChangeFeedID]remoteMaintainer) + // runningCfs are changefeeds that already running on other nodes. 
+ // A changefeed can appear more than once during epoch handover: the new + // maintainer may already report while an older epoch is still closing. + runningCfs := make(map[common.ChangeFeedID][]remoteMaintainer) for nodeID, resp := range responses { for _, status := range resp.Statuses { changeFeedID := common.NewChangefeedIDFromPB(status.ChangefeedID) - if old, ok := runningCfs[changeFeedID]; ok { - log.Panic("maintainer runs on multiple node", - zap.Stringer("changefeedID", changeFeedID), - zap.Stringer("oldNode", old.nodeID), - zap.Stringer("newNode", nodeID)) - } - runningCfs[changeFeedID] = remoteMaintainer{ + runningCfs[changeFeedID] = append(runningCfs[changeFeedID], remoteMaintainer{ nodeID: nodeID, status: status, - } + }) } } recoveredStaleDrainTarget := c.recoverStaleDispatcherDrainTargetFromBootstrap(responses) @@ -559,6 +555,9 @@ func (c *Controller) handleMaintainerStatus(from node.ID, statusList []*heartbea changes = append(changes, change) } } + if len(changes) == 0 { + return + } // Try to send updated changefeeds without blocking select { @@ -584,6 +583,14 @@ func (c *Controller) handleSingleMaintainerStatus( if !c.validateMaintainerNode(cf, from, cfID) { return nil } + if !common.MaintainerEpochMatches(status.MaintainerEpoch, cf.GetInfo().Epoch) { + log.Warn("drop stale maintainer status", + zap.Stringer("changefeed", cfID), + zap.Stringer("node", from), + zap.Uint64("statusMaintainerEpoch", status.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", cf.GetInfo().Epoch)) + return nil + } change := c.updateChangefeedStatus(cf, cfID, status) return change @@ -605,10 +612,15 @@ func (c *Controller) handleNonExistentChangefeed( zap.Stringer("sourceNode", from), zap.String("status", common.FormatMaintainerStatus(status))) - keyspaceID := c.getChangefeed(cfID).GetKeyspaceID() - // Remove working changefeed from maintainer if it's not in changefeedDB - _ = c.messageCenter.SendCommand(changefeed.RemoveMaintainerMessage(keyspaceID, cfID, from, true, 
true)) + _ = c.messageCenter.SendCommand(changefeed.RemoveMaintainerMessage( + common.DefaultKeyspaceID, + cfID, + from, + true, + true, + status.MaintainerEpoch, + )) } } @@ -665,7 +677,7 @@ func (c *Controller) updateChangefeedStatus( // It will load all changefeeds from metastore, and compare with running changefeeds // Then initialize the changefeeds that are not running on other nodes // And construct all changefeeds state in memory. -func (c *Controller) finishBootstrap(ctx context.Context, runningChangefeeds map[common.ChangeFeedID]remoteMaintainer) { +func (c *Controller) finishBootstrap(ctx context.Context, runningChangefeeds map[common.ChangeFeedID][]remoteMaintainer) { // load all changefeeds from metastore, and check if the changefeed is already in workingMap allChangefeeds, err := c.backend.GetAllChangefeeds(ctx) if err != nil { @@ -699,10 +711,11 @@ func (c *Controller) finishBootstrap(ctx context.Context, runningChangefeeds map log.Info("load all changefeeds", zap.Int("size", len(allChangefeeds))) // Compare all changefeeds and running changefeeds, and add them to changefeedDB for cfID, cfMeta := range allChangefeeds { - rm, ok := runningChangefeeds[cfID] + // Configuration items for compatibility with older versions + cfMeta.Info.VerifyAndComplete() + remotes := runningChangefeeds[cfID] + rm, ok, staleMaintainers := selectBootstrapMaintainer(cfID, cfMeta.Info.Epoch, remotes) if !ok { - // Configuration items for compatibility with older versions - cfMeta.Info.VerifyAndComplete() // The changefeed is not running on other nodes, add it to changefeedDB. // We will create this changefeed later. 
cf := changefeed.NewChangefeed(cfID, cfMeta.Info, cfMeta.Status.CheckpointTs, false) @@ -718,26 +731,41 @@ func (c *Controller) finishBootstrap(ctx context.Context, runningChangefeeds map zap.String("status", common.FormatMaintainerStatus(rm.status))) cf := changefeed.NewChangefeed(cfID, cfMeta.Info, rm.status.CheckpointTs, false) c.changefeedDB.AddReplicatingMaintainer(cf, rm.nodeID) - delete(runningChangefeeds, cfID) } + delete(runningChangefeeds, cfID) // check if the changefeed is stopping or removing, we need to stop all dispatchers completely switch cfMeta.Status.Progress { case config.ProgressStopping, config.ProgressRemoving: remove := cfMeta.Status.Progress == config.ProgressRemoving - c.operatorController.StopChangefeed(ctx, cfID, remove) + if !ok && len(staleMaintainers) > 0 { + c.changefeedDB.StopByChangefeedID(cfID, remove) + } else { + c.operatorController.StopChangefeed(ctx, cfID, remove) + } + c.stopStaleBootstrapMaintainers(cfID, staleMaintainers, remove) log.Info("stop changefeed when bootstrapping", zap.String("changefeed", cfID.String()), zap.Any("meta", cfMeta)) + default: + c.stopStaleBootstrapMaintainers(cfID, staleMaintainers, false) } } // Remove the changefeeds that are not in allChangefeeds, there are stale changefeeds. 
- for id, rm := range runningChangefeeds { - log.Warn("maintainer not found in local, remove it", - zap.String("changefeed", id.Name()), - zap.String("node", rm.nodeID.String()), - ) - keyspaceID := c.getChangefeed(id).GetKeyspaceID() - _ = c.messageCenter.SendCommand(changefeed.RemoveMaintainerMessage(keyspaceID, id, rm.nodeID, true, true)) + for id, remotes := range runningChangefeeds { + for _, rm := range remotes { + log.Warn("maintainer not found in local, remove it", + zap.String("changefeed", id.Name()), + zap.String("node", rm.nodeID.String()), + ) + _ = c.messageCenter.SendCommand(changefeed.RemoveMaintainerMessage( + common.DefaultKeyspaceID, + id, + rm.nodeID, + true, + true, + rm.status.MaintainerEpoch, + )) + } } // start operator and scheduler @@ -751,6 +779,86 @@ func (c *Controller) finishBootstrap(ctx context.Context, runningChangefeeds map log.Info("coordinator bootstrapped", zap.Any("nodeID", c.selfNode.ID)) } +// selectBootstrapMaintainer chooses the single remote maintainer that still owns +// the persisted epoch and returns the remaining reports as stale owners to stop. 
+func selectBootstrapMaintainer(
+	cfID common.ChangeFeedID,
+	currentEpoch uint64,
+	remotes []remoteMaintainer,
+) (remoteMaintainer, bool, []remoteMaintainer) {
+	if len(remotes) == 0 { // no node reported this changefeed: no owner and nothing stale
+		return remoteMaintainer{}, false, nil
+	}
+
+	exactMatches := make([]remoteMaintainer, 0, len(remotes))
+	compatMatches := make([]remoteMaintainer, 0, len(remotes))
+	staleMaintainers := make([]remoteMaintainer, 0, len(remotes))
+	for _, rm := range remotes {
+		statusEpoch := rm.status.MaintainerEpoch
+		switch {
+		case statusEpoch == currentEpoch: // report carries exactly the persisted epoch
+			exactMatches = append(exactMatches, rm)
+		case common.MaintainerEpochMatches(statusEpoch, currentEpoch): // not equal, but accepted by the shared matching rule (NOTE(review): presumably legacy reports; confirm in pkg/common)
+			compatMatches = append(compatMatches, rm)
+		default: // neither equal nor accepted by the matching rule
+			staleMaintainers = append(staleMaintainers, rm)
+		}
+	}
+
+	matches := exactMatches // exact matches take precedence over compat-only matches
+	if len(matches) == 0 {
+		matches = compatMatches
+	} else { // an exact owner exists, so compat-only reports are handed back as stale
+		staleMaintainers = append(staleMaintainers, compatMatches...)
+	}
+	if len(matches) > 1 { // more than one candidate owner in the chosen class
+		log.Panic("maintainer runs on multiple node",
+			zap.Stringer("changefeedID", cfID),
+			zap.Stringer("oldNode", matches[0].nodeID),
+			zap.Stringer("newNode", matches[1].nodeID),
+			zap.Uint64("currentMaintainerEpoch", currentEpoch),
+			zap.Uint64("statusMaintainerEpoch", matches[0].status.MaintainerEpoch))
+	}
+	if len(matches) == 0 { // every report was stale; caller sees ok == false plus the stale list
+		return remoteMaintainer{}, false, staleMaintainers
+	}
+	return matches[0], true, staleMaintainers
+}
+
+// stopStaleBootstrapMaintainers fences bootstrap reports from older owner epochs.
+// If another operator already owns the changefeed slot, stale owners are removed
+// with direct best-effort commands so the active operator is not replaced.
+func (c *Controller) stopStaleBootstrapMaintainers(
+	cfID common.ChangeFeedID,
+	staleMaintainers []remoteMaintainer,
+	removed bool,
+) {
+	for _, stale := range staleMaintainers {
+		log.Warn("ignore running maintainer with stale epoch when bootstrapping",
+			zap.String("changefeed", cfID.String()),
+			zap.String("node", stale.nodeID.String()),
+			zap.Uint64("statusMaintainerEpoch", stale.status.MaintainerEpoch),
+			zap.String("status", common.FormatMaintainerStatus(stale.status)))
+		if c.operatorController.GetOperator(cfID) != nil { // an operator is already registered for this changefeed
+			keyspaceID := common.DefaultKeyspaceID // fallback when the changefeed is not in changefeedDB
+			if cf := c.changefeedDB.GetByID(cfID); cf != nil {
+				keyspaceID = cf.GetKeyspaceID()
+			}
+			_ = c.messageCenter.SendCommand(changefeed.RemoveMaintainerMessage( // best-effort: the send error is deliberately dropped
+				keyspaceID,
+				cfID,
+				stale.nodeID,
+				true,
+				removed,
+				stale.status.MaintainerEpoch, // address the removal to the epoch the stale owner reported
+			))
+			continue // do not add a second operator for the same changefeed
+		}
+		c.operatorController.StopRemoteMaintainerWithMaintainerEpoch( // no operator registered: delegate the stop to the operator controller
+			cfID, stale.nodeID, removed, stale.status.MaintainerEpoch)
+	}
+}
+
 func (c *Controller) Stop() {
 	c.taskHandlerMutex.Lock()
 	for _, h := range c.taskHandlers {
@@ -899,27 +1007,29 @@ func (c *Controller) ResumeChangefeed(
 		return err
 	}
 
-	resumedInfo, err := c.backend.ResumeChangefeed(ctx, id, newCheckpointTs)
+	checkpointTs := cf.GetStatus().CheckpointTs
+	if newCheckpointTs > 0 {
+		checkpointTs = newCheckpointTs
+	}
+	epoch := pdutil.GenerateChangefeedEpoch(ctx, c.pdClient)
+	normalState := config.StateNormal
+	info, err := c.backend.BumpChangefeedEpoch(ctx, id, epoch, changefeed.EpochBumpOptions{
+		CheckpointTs: checkpointTs,
+		Progress:     config.ProgressNone,
+		UpdateStatus: true,
+		State:        &normalState,
+		UpdateError:  true,
+	})
 	if err != nil {
-		return err
+		return errors.Trace(err)
 	}
-	if resumedInfo == nil {
+	if info == nil {
 		return errors.New("resumed changefeed info is nil")
 	}
-
-	// Use the backend-returned info so direct metadata edits made while the
-	// changefeed was stopped are not overwritten by the stale in-memory copy.
-	clone, err := resumedInfo.Clone()
-	if err != nil {
-		return err
-	}
-
-	clone.State = config.StateNormal
-	clone.Epoch = pdutil.GenerateChangefeedEpoch(ctx, c.pdClient)
-	cf.SetInfo(clone)
+	cf.SetInfo(info)
 
 	status := cf.GetStatusForResume()
-	status.CheckpointTs = newCheckpointTs
+	status.CheckpointTs = checkpointTs
 	_, _, runningErr := cf.ForceUpdateStatus(status)
 	if runningErr != nil {
 		return errors.New(runningErr.Message)
@@ -1064,22 +1174,35 @@ func (c *Controller) newBootstrapMessage(id node.ID, addr string) *messaging.Tar
 		&heartbeatpb.CoordinatorBootstrapRequest{Version: c.version})
 }
 
-func (c *Controller) updateChangefeedEpoch(ctx context.Context, id common.ChangeFeedID) {
+// updateChangefeedEpoch bumps the persisted owner epoch before a state change
+// can create a new maintainer generation from the current coordinator.
+//
+// It returns nil without calling the backend when the changefeed is not in
+// changefeedDB, and an error when the backend bump fails or yields no info.
+func (c *Controller) updateChangefeedEpoch(
+	ctx context.Context,
+	id common.ChangeFeedID,
+	options changefeed.EpochBumpOptions,
+) error {
 	cf := c.changefeedDB.GetByID(id)
 	if cf == nil {
 		log.Warn("changefeed not found, skip updating epoch", zap.String("changefeed", id.String()))
-		return
+		return nil
 	}
-	clonedInfo, err := cf.GetInfo().Clone()
+	epoch := pdutil.GenerateChangefeedEpoch(ctx, c.pdClient)
+	info, err := c.backend.BumpChangefeedEpoch(ctx, id, epoch, options)
 	if err != nil {
-		log.Panic("clone changefeed info failed", zap.String("changefeed", id.String()), zap.Error(err))
+		return errors.Trace(err)
 	}
-	clonedInfo.Epoch = pdutil.GenerateChangefeedEpoch(ctx, c.pdClient)
-	cf.SetInfo(clonedInfo)
+	if info == nil {
+		// Same guard as ResumeChangefeed: never publish a nil info, later cf.GetInfo() readers dereference it.
+		return errors.New("bumped changefeed info is nil")
+	}
+	cf.SetInfo(info)
+	return nil
 }
 
-// moveChangefeedToSchedulingQueue moves a changefeed to scheduling queue
-// It will set a new epoch for the changefeed before moving it to scheduling queue
+// moveChangefeedToSchedulingQueue moves a changefeed to scheduling queue.
func (c *Controller) moveChangefeedToSchedulingQueue( id common.ChangeFeedID, resetBackoff bool, diff --git a/coordinator/controller_drain_test.go b/coordinator/controller_drain_test.go index dead3a7057..cd72ad7ca6 100644 --- a/coordinator/controller_drain_test.go +++ b/coordinator/controller_drain_test.go @@ -81,7 +81,7 @@ func newDrainTestController(t *testing.T) (*Controller, *drain.Controller, node. drainController := drain.NewController(mc) db := changefeed.NewChangefeedDB(1) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) target := node.ID("target") nodeManager.GetAliveNodes()[target] = &node.Info{ID: target} diff --git a/coordinator/controller_test.go b/coordinator/controller_test.go index 648d1f6e60..b72dc7fdc5 100644 --- a/coordinator/controller_test.go +++ b/coordinator/controller_test.go @@ -30,13 +30,26 @@ import ( appcontext "github.com/pingcap/ticdc/pkg/common/context" "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/eventservice" "github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/node" + pkgscheduler "github.com/pingcap/ticdc/pkg/scheduler" "github.com/pingcap/ticdc/server/watcher" + "github.com/pingcap/ticdc/utils/threadpool" "github.com/stretchr/testify/require" "go.uber.org/atomic" ) +type noopScheduler struct{} + +func (noopScheduler) Execute() time.Time { + return time.Now().Add(time.Hour) +} + +func (noopScheduler) Name() string { + return pkgscheduler.BasicScheduler +} + func TestOnPeriodTaskAdvanceLiveness(t *testing.T) { newController := func(t *testing.T) (*Controller, chan *messaging.TargetMessage, *changefeed.ChangefeedDB, node.ID) { t.Helper() @@ -59,7 +72,7 @@ func TestOnPeriodTaskAdvanceLiveness(t *testing.T) { return &Controller{ changefeedDB: changefeedDB, operatorController: operator.NewOperatorController( - self, 
changefeedDB, backend, 10, + self, changefeedDB, backend, nil, 10, ), nodeManager: nodeManager, initialized: atomic.NewBool(true), @@ -205,6 +218,7 @@ func TestMaintainerHeartbeatAdmissionRequiresInitializedSender(t *testing.T) { &node.Info{ID: node.ID("coordinator")}, db, nil, + nil, 10, ), bootstrapper: bootstrap.NewBootstrapper[heartbeatpb.CoordinatorBootstrapResponse]( @@ -257,6 +271,294 @@ func TestMaintainerHeartbeatAdmissionRequiresInitializedSender(t *testing.T) { require.Equal(t, uint64(200), cf.GetStatus().CheckpointTs) } +func TestMaintainerHeartbeatAdmissionDropsStaleMaintainerEpoch(t *testing.T) { + appcontext.SetService(appcontext.MessageCenter, messaging.NewMockMessageCenter()) + appcontext.SetService(watcher.NodeManagerName, watcher.NewNodeManager(nil, nil)) + + db := changefeed.NewChangefeedDB(1) + cfID := common.NewChangeFeedIDWithName("cf", common.DefaultKeyspaceName) + owner := node.ID("owner") + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "blackhole://", + State: config.StateNormal, + Epoch: 2, + }, 100, false) + db.AddReplicatingMaintainer(cf, owner) + + controller := &Controller{ + changefeedDB: db, + operatorController: operator.NewOperatorController( + &node.Info{ID: node.ID("coordinator")}, + db, + nil, + nil, + 10, + ), + } + + stale := &heartbeatpb.MaintainerStatus{ + ChangefeedID: cfID.ToPB(), + CheckpointTs: 200, + State: heartbeatpb.ComponentState_Working, + BootstrapDone: true, + MaintainerEpoch: 1, + } + require.Nil(t, controller.handleSingleMaintainerStatus(owner, stale, cfID)) + require.Equal(t, uint64(100), cf.GetStatus().CheckpointTs) + + current := &heartbeatpb.MaintainerStatus{ + ChangefeedID: cfID.ToPB(), + CheckpointTs: 200, + State: heartbeatpb.ComponentState_Working, + BootstrapDone: true, + MaintainerEpoch: 2, + } + require.NotNil(t, controller.handleSingleMaintainerStatus(owner, current, cfID)) + require.Equal(t, uint64(200), 
cf.GetStatus().CheckpointTs) +} + +func TestHandleNonExistentChangefeedRemovesWithReportedEpoch(t *testing.T) { + mc := messaging.NewMockMessageCenter() + db := changefeed.NewChangefeedDB(1) + controller := &Controller{ + changefeedDB: db, + operatorController: operator.NewOperatorController( + &node.Info{ID: node.ID("coordinator")}, + db, + nil, + nil, + 10, + ), + messageCenter: mc, + } + cfID := common.NewChangeFeedIDWithName("cf", common.DefaultKeyspaceName) + + controller.handleNonExistentChangefeed(cfID, node.ID("owner"), &heartbeatpb.MaintainerStatus{ + ChangefeedID: cfID.ToPB(), + State: heartbeatpb.ComponentState_Working, + MaintainerEpoch: 7, + }) + + msg := <-mc.GetMessageChannel() + req := msg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, uint64(7), req.MaintainerEpoch) + require.True(t, req.Cascade) + require.True(t, req.Removed) +} + +func TestFinishBootstrapStopsStaleEpochMaintainerWithReportedEpoch(t *testing.T) { + testCases := []struct { + name string + progress config.Progress + expectRemoved bool + expectInDB bool + expectAbsent bool + expectStopped bool + }{ + { + name: "running", + progress: config.ProgressNone, + expectInDB: true, + expectAbsent: true, + }, + { + name: "removing", + progress: config.ProgressRemoving, + expectRemoved: true, + }, + { + name: "stopping", + progress: config.ProgressStopping, + expectInDB: true, + expectStopped: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + mc := messaging.NewMockMessageCenter() + appcontext.SetService(appcontext.MessageCenter, mc) + appcontext.SetService(appcontext.SchemaStore, eventservice.NewMockSchemaStore()) + + nodeManager := watcher.NewNodeManager(nil, nil) + appcontext.SetService(watcher.NodeManagerName, nodeManager) + oldNode := node.ID("old-owner") + nodeManager.GetAliveNodes()[oldNode] = &node.Info{ID: oldNode} + + db := 
changefeed.NewChangefeedDB(1) + cfID := common.NewChangeFeedIDWithName(tc.name, common.DefaultKeyspaceName) + info := &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "blackhole://", + State: config.StateNormal, + Epoch: 2, + } + db.Init(map[common.ChangeFeedID]*changefeed.Changefeed{ + cfID: changefeed.NewChangefeed(cfID, info, 100, false), + }) + backend.EXPECT().GetAllChangefeeds(gomock.Any()).Return(map[common.ChangeFeedID]*changefeed.ChangefeedMetaWrapper{ + cfID: { + Info: info, + Status: &config.ChangeFeedStatus{CheckpointTs: 100, Progress: tc.progress}, + }, + }, nil).Times(1) + + self := &node.Info{ID: node.ID("coordinator")} + controller := &Controller{ + selfNode: self, + initialized: atomic.NewBool(false), + backend: backend, + changefeedDB: db, + operatorController: operator.NewOperatorController( + self, + db, + backend, + nil, + 10, + ), + nodeManager: nodeManager, + taskScheduler: threadpool.NewThreadPool(1), + scheduler: pkgscheduler.NewController(map[string]pkgscheduler.Scheduler{ + pkgscheduler.BasicScheduler: noopScheduler{}, + }), + messageCenter: mc, + } + t.Cleanup(controller.taskScheduler.Stop) + + controller.finishBootstrap(context.Background(), map[common.ChangeFeedID][]remoteMaintainer{ + cfID: {{ + nodeID: oldNode, + status: &heartbeatpb.MaintainerStatus{ + ChangefeedID: cfID.ToPB(), + State: heartbeatpb.ComponentState_Working, + CheckpointTs: 200, + BootstrapDone: true, + MaintainerEpoch: 1, + }, + }}, + }) + + if tc.expectInDB { + require.NotNil(t, db.GetByID(cfID)) + } else { + require.Nil(t, db.GetByID(cfID)) + } + if tc.expectAbsent { + require.Equal(t, 1, db.GetAbsentSize()) + } + if tc.expectStopped { + require.Equal(t, 1, db.GetStoppedSize()) + } + + op := controller.operatorController.GetOperator(cfID) + require.NotNil(t, op) + require.False(t, op.IsFinished()) + reqMsg := op.Schedule() + require.Equal(t, oldNode, reqMsg.To) + req := 
reqMsg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, uint64(1), req.MaintainerEpoch) + require.Equal(t, tc.expectRemoved, req.Removed) + }) + } +} + +func TestHandleBootstrapResponsesKeepsCurrentEpochAndStopsStaleDuplicate(t *testing.T) { + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + mc := messaging.NewMockMessageCenter() + oldNode := node.ID("old-owner") + currentNode := node.ID("current-owner") + nodeManager := watcher.NewNodeManager(nil, nil) + nodeManager.GetAliveNodes()[oldNode] = &node.Info{ID: oldNode} + nodeManager.GetAliveNodes()[currentNode] = &node.Info{ID: currentNode} + appcontext.SetService(appcontext.MessageCenter, mc) + appcontext.SetService(appcontext.SchemaStore, eventservice.NewMockSchemaStore()) + appcontext.SetService(watcher.NodeManagerName, nodeManager) + + cfID := common.NewChangeFeedIDWithName("duplicate-epoch", common.DefaultKeyspaceName) + info := &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "blackhole://", + State: config.StateNormal, + Epoch: 2, + } + backend.EXPECT().GetAllChangefeeds(gomock.Any()).Return(map[common.ChangeFeedID]*changefeed.ChangefeedMetaWrapper{ + cfID: { + Info: info, + Status: &config.ChangeFeedStatus{CheckpointTs: 100}, + }, + }, nil).Times(1) + + db := changefeed.NewChangefeedDB(1) + self := &node.Info{ID: node.ID("coordinator")} + controller := &Controller{ + selfNode: self, + initialized: atomic.NewBool(false), + backend: backend, + changefeedDB: db, + operatorController: operator.NewOperatorController( + self, + db, + backend, + nil, + 10, + ), + nodeManager: nodeManager, + taskScheduler: threadpool.NewThreadPool(1), + scheduler: pkgscheduler.NewController(map[string]pkgscheduler.Scheduler{ + pkgscheduler.BasicScheduler: noopScheduler{}, + }), + messageCenter: mc, + bootstrapper: bootstrap.NewBootstrapper[heartbeatpb.CoordinatorBootstrapResponse]( + "test", + func(node.ID, string) 
*messaging.TargetMessage { return nil }, + ), + } + t.Cleanup(controller.taskScheduler.Stop) + + require.NotPanics(t, func() { + controller.handleBootstrapResponses(context.Background(), map[node.ID]*heartbeatpb.CoordinatorBootstrapResponse{ + oldNode: { + Statuses: []*heartbeatpb.MaintainerStatus{{ + ChangefeedID: cfID.ToPB(), + State: heartbeatpb.ComponentState_Working, + CheckpointTs: 150, + BootstrapDone: true, + MaintainerEpoch: 1, + }}, + }, + currentNode: { + Statuses: []*heartbeatpb.MaintainerStatus{{ + ChangefeedID: cfID.ToPB(), + State: heartbeatpb.ComponentState_Working, + CheckpointTs: 200, + BootstrapDone: true, + MaintainerEpoch: 2, + }}, + }, + }) + }) + + cf := db.GetByID(cfID) + require.NotNil(t, cf) + require.Equal(t, currentNode, cf.GetNodeID()) + require.Equal(t, uint64(200), cf.GetStatus().CheckpointTs) + + op := controller.operatorController.GetOperator(cfID) + require.NotNil(t, op) + reqMsg := op.Schedule() + require.Equal(t, oldNode, reqMsg.To) + req := reqMsg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, uint64(1), req.MaintainerEpoch) + require.False(t, req.Removed) +} + func TestResumeChangefeed(t *testing.T) { // Scenario: resume should propagate backend failures and update in-memory state after success. 
// Steps: try a missing changefeed, simulate a backend resume failure, then return a persisted @@ -280,11 +582,11 @@ func TestResumeChangefeed(t *testing.T) { // no changefeed require.NotNil(t, controller.ResumeChangefeed(context.Background(), common.NewChangeFeedIDWithName("test2", common.DefaultKeyspaceName), 12, true)) - backend.EXPECT().ResumeChangefeed(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, errors.New("failed")).Times(1) + backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()).Return(nil, errors.New("failed")).Times(1) require.NotNil(t, controller.ResumeChangefeed(context.Background(), cfID, 12, true)) require.Equal(t, config.StateFailed, changefeedDB.GetByID(cfID).GetInfo().State) - backend.EXPECT().ResumeChangefeed(gomock.Any(), gomock.Any(), gomock.Any()).Return(cf.GetInfo(), nil).Times(1) + expectResumeEpochBump(t, backend, cfID, cf, 12) require.Nil(t, controller.ResumeChangefeed(context.Background(), cfID, 12, false)) require.Equal(t, config.StateNormal, changefeedDB.GetByID(cfID).GetInfo().State) } @@ -338,7 +640,7 @@ func TestResumeChangefeedOverwriteUpdatesLastSavedCheckpointTs(t *testing.T) { changefeedDB.AddStoppedChangefeed(cf) newCheckpointTs := uint64(120) - backend.EXPECT().ResumeChangefeed(gomock.Any(), gomock.Any(), gomock.Any()).Return(cf.GetInfo(), nil).Times(1) + expectResumeEpochBump(t, backend, cfID, cf, newCheckpointTs) require.Nil(t, controller.ResumeChangefeed(context.Background(), cfID, newCheckpointTs, true)) require.Equal(t, newCheckpointTs, changefeedDB.GetByID(cfID).GetLastSavedCheckPointTs()) } @@ -375,7 +677,7 @@ func TestResumeChangefeedIgnoresStaleMaintainerErrorAndSchedules(t *testing.T) { _, _, err := cf.ForceUpdateStatus(stale) require.NotNil(t, err) - backend.EXPECT().ResumeChangefeed(gomock.Any(), gomock.Any(), gomock.Any()).Return(cf.GetInfo(), nil).Times(1) + expectResumeEpochBump(t, backend, cfID, cf, 100) require.NoError(t, controller.ResumeChangefeed(context.Background(), 
cfID, 100, false)) // The changefeed should be enqueued for scheduling and should not be blocked by the stale error. @@ -392,8 +694,8 @@ func TestResumeChangefeedIgnoresStaleMaintainerErrorAndSchedules(t *testing.T) { func TestResumeChangefeedUsesBackendReturnedInfo(t *testing.T) { // Scenario: stopped changefeed metadata can be edited directly in the backend while // the coordinator still has an older in-memory copy. Steps: resume the changefeed - // with backend-returned info whose sink URI differs from memory, then verify the - // in-memory changefeed uses the backend value instead of overwriting it. + // with epoch-bumped backend info whose sink URI differs from memory, then verify + // the in-memory changefeed uses the backend value instead of overwriting it. ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) changefeedDB := changefeed.NewChangefeedDB(1216) @@ -414,13 +716,47 @@ func TestResumeChangefeedUsesBackendReturnedInfo(t *testing.T) { require.NoError(t, err) backendInfo.SinkURI = "mysql://upstream:4000" backendInfo.State = config.StateNormal - backend.EXPECT().ResumeChangefeed(gomock.Any(), cfID, uint64(100)).Return(backendInfo, nil).Times(1) + backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + require.NotZero(t, candidateEpoch) + require.True(t, options.UpdateStatus) + require.True(t, options.UpdateError) + require.NotNil(t, options.State) + require.Equal(t, config.StateNormal, *options.State) + backendInfo.Epoch = candidateEpoch + return backendInfo, nil + }).Times(1) require.NoError(t, controller.ResumeChangefeed(context.Background(), cfID, 100, false)) require.Equal(t, "mysql://upstream:4000", changefeedDB.GetByID(cfID).GetInfo().SinkURI) require.Equal(t, config.StateNormal, changefeedDB.GetByID(cfID).GetInfo().State) } +func expectResumeEpochBump( + t *testing.T, + backend *mock_changefeed.MockBackend, + cfID common.ChangeFeedID, + cf *changefeed.Changefeed, + checkpointTs uint64, +) { + t.Helper() + + backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + require.NotZero(t, candidateEpoch) + require.NotNil(t, options.State) + require.Equal(t, config.StateNormal, *options.State) + require.True(t, options.UpdateStatus) + require.Equal(t, checkpointTs, options.CheckpointTs) + require.Equal(t, config.ProgressNone, options.Progress) + info, err := cf.GetInfo().Clone() + require.NoError(t, err) + info.State = *options.State + info.Epoch = candidateEpoch + return info, nil + }).Times(1) +} + func TestPauseChangefeed(t *testing.T) { ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) @@ -438,7 +774,7 @@ func TestPauseChangefeed(t *testing.T) { backend: backend, changefeedDB: changefeedDB, operatorController: operator.NewOperatorController(node.NewInfo("node1", ""), - changefeedDB, backend, 10), + changefeedDB, backend, nil, 10), } cfID := common.NewChangeFeedIDWithName("test", 
common.DefaultKeyspaceName) cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ @@ -613,7 +949,7 @@ func TestRemoveChangefeed(t *testing.T) { backend: backend, changefeedDB: changefeedDB, operatorController: operator.NewOperatorController(node.NewInfo("node1", ""), - changefeedDB, backend, 10), + changefeedDB, backend, nil, 10), } cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ @@ -658,7 +994,7 @@ func TestListChangefeed(t *testing.T) { backend: backend, changefeedDB: changefeedDB, operatorController: operator.NewOperatorController(node.NewInfo("node1", ""), - changefeedDB, backend, 10), + changefeedDB, backend, nil, 10), } cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ @@ -705,7 +1041,7 @@ func TestCreateChangefeed(t *testing.T) { backend: backend, changefeedDB: changefeedDB, operatorController: operator.NewOperatorController(node.NewInfo("node1", ""), - changefeedDB, backend, 10), + changefeedDB, backend, nil, 10), initialized: atomic.NewBool(false), } cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) diff --git a/coordinator/coordinator.go b/coordinator/coordinator.go index be777947ec..3fc5a035f3 100644 --- a/coordinator/coordinator.go +++ b/coordinator/coordinator.go @@ -270,6 +270,21 @@ func (c *coordinator) handleStateChange( return nil } + if event.state == config.StateWarning { + warningState := config.StateWarning + currentMaintainerEpoch := currentInfo.Epoch + if err := c.controller.updateChangefeedEpoch(ctx, event.changefeedID, changefeed.EpochBumpOptions{ + State: &warningState, + Error: event.err, + UpdateError: true, + }); err != nil { + return errors.Trace(err) + } + c.controller.operatorController.StopChangefeedWithMaintainerEpoch(ctx, event.changefeedID, false, currentMaintainerEpoch) + 
c.controller.moveChangefeedToSchedulingQueue(event.changefeedID, false, false) + return nil + } + cfInfo, err := currentInfo.Clone() if err != nil { return errors.Trace(err) @@ -287,15 +302,9 @@ func (c *coordinator) handleStateChange( } cf.SetInfo(cfInfo) - switch event.state { - case config.StateWarning: - c.controller.operatorController.StopChangefeed(ctx, event.changefeedID, false) - c.controller.updateChangefeedEpoch(ctx, event.changefeedID) - c.controller.moveChangefeedToSchedulingQueue(event.changefeedID, false, false) - case config.StateFailed, config.StateFinished: + if event.state == config.StateFailed || event.state == config.StateFinished { failpoint.Inject("BlockBeforeStopChangefeed", func() {}) c.controller.operatorController.StopChangefeed(ctx, event.changefeedID, false) - default: } return nil } diff --git a/coordinator/coordinator_test.go b/coordinator/coordinator_test.go index 8c6766e880..9566b2cb16 100644 --- a/coordinator/coordinator_test.go +++ b/coordinator/coordinator_test.go @@ -45,6 +45,7 @@ import ( "github.com/pingcap/ticdc/pkg/pdutil" "github.com/pingcap/ticdc/server/watcher" "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/oracle" pd "github.com/tikv/pd/client" pdgc "github.com/tikv/pd/client/clients/gc" "go.uber.org/zap" @@ -61,6 +62,10 @@ func (m *mockPdClient) UpdateServiceGCSafePoint(ctx context.Context, serviceID s return safePoint, nil } +func (m *mockPdClient) GetTS(ctx context.Context) (int64, int64, error) { + return oracle.GetPhysical(time.Now()), 0, nil +} + func (m *mockPdClient) GetGCStatesClient(keyspaceID uint32) pdgc.GCStatesClient { m.mu.Lock() defer m.mu.Unlock() @@ -383,6 +388,53 @@ func (m *mockEtcdClient) GetOwnerID(ctx context.Context) (config.CaptureID, erro return config.CaptureID(m.ownerID), nil } +func mockBumpChangefeedEpoch( + backend *mock_changefeed.MockBackend, + cfs map[common.ChangeFeedID]*changefeed.ChangefeedMetaWrapper, +) { + var mu sync.Mutex + backend.EXPECT(). 
+ BumpChangefeedEpoch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + _ context.Context, + id common.ChangeFeedID, + candidateEpoch uint64, + options changefeed.EpochBumpOptions, + ) (*config.ChangeFeedInfo, error) { + mu.Lock() + defer mu.Unlock() + + cf, ok := cfs[id] + if !ok { + return nil, fmt.Errorf("changefeed %s not found", id.String()) + } + info, err := cf.Info.Clone() + if err != nil { + return nil, err + } + info.Epoch, err = pdutil.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) + if err != nil { + return nil, err + } + if options.State != nil { + info.State = *options.State + } + if options.UpdateError { + info.Error = options.Error + } + cf.Info = info + if cf.Status == nil { + cf.Status = &config.ChangeFeedStatus{} + } + if options.UpdateStatus { + cf.Status.CheckpointTs = options.CheckpointTs + cf.Status.Progress = options.Progress + } + return info, nil + }). + AnyTimes() +} + func TestCoordinatorScheduling(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/debug/pprof/", pprof.Index) @@ -433,6 +485,7 @@ func TestCoordinatorScheduling(t *testing.T) { cfs := make(map[common.ChangeFeedID]*changefeed.ChangefeedMetaWrapper) backend.EXPECT().GetAllChangefeeds(gomock.Any()).Return(cfs, nil).AnyTimes() backend.EXPECT().UpdateChangefeed(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + mockBumpChangefeedEpoch(backend, cfs) for i := 0; i < cfSize; i++ { cfID := common.NewChangeFeedIDWithDisplayName(common.ChangeFeedDisplayName{ Name: fmt.Sprintf("%d", i), @@ -505,6 +558,7 @@ func TestScaleNode(t *testing.T) { } backend.EXPECT().GetAllChangefeeds(gomock.Any()).Return(cfs, nil).AnyTimes() backend.EXPECT().UpdateChangefeed(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + mockBumpChangefeedEpoch(backend, cfs) cr := New(info, &mockPdClient{}, backend, serviceID, 100, 10000, time.Millisecond*1) @@ -863,6 +917,7 @@ func 
TestHandleStateChangeSkipsDuplicateRuntimeStatePersistence(t *testing.T) { self, changefeedDB, backend, + nil, 10, ), } @@ -903,6 +958,81 @@ func TestHandleStateChangeSkipsDuplicateRuntimeStatePersistence(t *testing.T) { require.Equal(t, 1, changefeedDB.GetReplicatingSize()) } +func TestHandleStateChangeBumpsEpochForWarningState(t *testing.T) { + ctrl := gomock.NewController(t) + t.Cleanup(ctrl.Finish) + + backend := mock_changefeed.NewMockBackend(ctrl) + changefeedDB := changefeed.NewChangefeedDB(1216) + self := node.NewInfo("localhost:8300", "") + nodeManager := watcher.NewNodeManager(nil, nil) + nodeManager.GetAliveNodes()[self.ID] = self + appcontext.SetService(appcontext.MessageCenter, messaging.NewMockMessageCenter()) + appcontext.SetService(watcher.NodeManagerName, nodeManager) + + controller := &Controller{ + backend: backend, + changefeedDB: changefeedDB, + operatorController: operator.NewOperatorController( + self, + changefeedDB, + backend, + nil, + 10, + ), + } + co := &coordinator{ + backend: backend, + controller: controller, + } + + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + oldEpoch := uint64(233) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + State: config.StateNormal, + SinkURI: "mysql://127.0.0.1:3306", + Epoch: oldEpoch, + }, 1, false) + changefeedDB.AddReplicatingMaintainer(cf, self.ID) + + newError := &config.RunningError{ + Time: time.Unix(2, 0), + Addr: "127.0.0.1:8300", + Code: "CDC:ErrSinkURIInvalid", + Message: "sink uri invalid", + } + backend.EXPECT(). + BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + require.NotZero(t, candidateEpoch) + require.NotNil(t, options.State) + require.Equal(t, config.StateWarning, *options.State) + require.True(t, options.UpdateError) + require.Equal(t, newError, options.Error) + require.False(t, options.UpdateStatus) + info, err := cf.GetInfo().Clone() + require.NoError(t, err) + info.State = *options.State + info.Error = options.Error + info.Epoch = candidateEpoch + return info, nil + }). + Times(1) + + event := newChangefeedChange(cf, config.StateWarning, ChangeState, newError) + require.NoError(t, co.handleStateChange(context.Background(), event)) + + require.Equal(t, config.StateWarning, cf.GetInfo().State) + require.Equal(t, newError, cf.GetInfo().Error) + require.Greater(t, cf.GetInfo().Epoch, oldEpoch) + op := controller.operatorController.GetOperator(cfID) + require.NotNil(t, op) + req := op.Schedule().Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, oldEpoch, req.MaintainerEpoch) +} + func TestHandleStateChangeSkipsNilChangefeedInfo(t *testing.T) { ctrl := gomock.NewController(t) t.Cleanup(ctrl.Finish) diff --git a/coordinator/create_changefeed_gc_test.go b/coordinator/create_changefeed_gc_test.go index 84ff8f277c..35ce621d51 100644 --- a/coordinator/create_changefeed_gc_test.go +++ b/coordinator/create_changefeed_gc_test.go @@ -59,6 +59,7 @@ func newTestCoordinatorWithGCManager( self, changefeedDB, backend, + nil, 10, ), initialized: atomic.NewBool(true), diff --git a/coordinator/operator/operator_add.go b/coordinator/operator/operator_add.go index ecf2e4af37..11db925623 100644 --- a/coordinator/operator/operator_add.go +++ b/coordinator/operator/operator_add.go @@ -72,6 +72,7 @@ func (m *AddMaintainerOperator) Check(from node.ID, status *heartbeatpb.Maintain // Require bootstrap to be done before considering the maintainer successfully started. 
// This avoids false positives when a removal-only maintainer reports Working. if !m.finished.Load() && from == m.dest && + common.MaintainerEpochMatches(status.MaintainerEpoch, m.cf.GetInfo().Epoch) && status.State == heartbeatpb.ComponentState_Working && status.BootstrapDone { log.Info("maintainer report working status", diff --git a/coordinator/operator/operator_add_test.go b/coordinator/operator/operator_add_test.go index 8c272b4697..3192275eb0 100644 --- a/coordinator/operator/operator_add_test.go +++ b/coordinator/operator/operator_add_test.go @@ -18,6 +18,8 @@ import ( "github.com/pingcap/ticdc/coordinator/changefeed" "github.com/pingcap/ticdc/heartbeatpb" + "github.com/pingcap/ticdc/pkg/common" + "github.com/pingcap/ticdc/pkg/config" "github.com/stretchr/testify/require" ) @@ -52,7 +54,14 @@ func TestAddMaintainerOperator_OnTaskRemoved(t *testing.T) { // TestAddMaintainerOperator_CheckRequiresBootstrapDone verifies that the operator only // completes after it observes a Working status with BootstrapDone from the destination node. 
func TestAddMaintainerOperator_CheckRequiresBootstrapDone(t *testing.T) { - op := NewAddMaintainerOperator(nil, &changefeed.Changefeed{}, "n1") + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 1, + }, 1, true) + op := NewAddMaintainerOperator(nil, cf, "n1") op.Check("n1", &heartbeatpb.MaintainerStatus{ State: heartbeatpb.ComponentState_Working, @@ -61,8 +70,28 @@ func TestAddMaintainerOperator_CheckRequiresBootstrapDone(t *testing.T) { require.False(t, op.finished.Load()) op.Check("n1", &heartbeatpb.MaintainerStatus{ - State: heartbeatpb.ComponentState_Working, - BootstrapDone: true, + State: heartbeatpb.ComponentState_Working, + BootstrapDone: true, + MaintainerEpoch: 1, }) require.True(t, op.finished.Load()) } + +func TestAddMaintainerOperator_CheckAcceptsCompatEpochDuringRollingUpgrade(t *testing.T) { + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 2, + }, 1, true) + op := NewAddMaintainerOperator(nil, cf, "n1") + status := &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Working, + BootstrapDone: true, + MaintainerEpoch: 0, + } + + op.Check("n1", status) + require.True(t, op.finished.Load()) +} diff --git a/coordinator/operator/operator_controller.go b/coordinator/operator/operator_controller.go index 0c33349a92..170ce36242 100644 --- a/coordinator/operator/operator_controller.go +++ b/coordinator/operator/operator_controller.go @@ -24,27 +24,36 @@ import ( "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/pkg/common" appcontext "github.com/pingcap/ticdc/pkg/common/context" + "github.com/pingcap/ticdc/pkg/config" 
"github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/metrics" "github.com/pingcap/ticdc/pkg/node" + "github.com/pingcap/ticdc/pkg/pdutil" "github.com/pingcap/ticdc/pkg/scheduler/operator" "github.com/pingcap/ticdc/server/watcher" + pd "github.com/tikv/pd/client" "go.uber.org/zap" ) +const operatorEpochBumpTimeout = 10 * time.Second + // Controller is the operator controller, it manages all operators. // And the Controller is responsible for the execution of the operator. type Controller struct { mu sync.RWMutex - role string - changefeedDB *changefeed.ChangefeedDB - operators map[common.ChangeFeedID]*operator.OperatorWithTime[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] + role string + changefeedDB *changefeed.ChangefeedDB + operators map[common.ChangeFeedID]*operator.OperatorWithTime[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] + // epochBumping reserves an owner-changing operator slot while its epoch bump + // runs outside mu, so another add or move cannot persist a newer epoch first. 
+ epochBumping map[common.ChangeFeedID]struct{} runningQueue operator.OperatorQueue[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] batchSize int messageCenter messaging.MessageCenter selfNode *node.Info backend changefeed.Backend + pdClient pd.Client nodeManger *watcher.NodeManager } @@ -52,17 +61,20 @@ func NewOperatorController( selfNode *node.Info, db *changefeed.ChangefeedDB, backend changefeed.Backend, + pdClient pd.Client, batchSize int, ) *Controller { oc := &Controller{ role: "coordinator", operators: make(map[common.ChangeFeedID]*operator.OperatorWithTime[common.ChangeFeedID, *heartbeatpb.MaintainerStatus]), + epochBumping: make(map[common.ChangeFeedID]struct{}), runningQueue: make(operator.OperatorQueue[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], 0), messageCenter: appcontext.GetService[messaging.MessageCenter](appcontext.MessageCenter), batchSize: batchSize, changefeedDB: db, selfNode: selfNode, backend: backend, + pdClient: pdClient, nodeManger: appcontext.GetService[*watcher.NodeManager](watcher.NodeManagerName), } return oc @@ -86,10 +98,17 @@ func (oc *Controller) Execute() time.Time { oc.mu.RUnlock() if msg != nil { - _ = oc.messageCenter.SendCommand(msg) - log.Info("send command to maintainer", - zap.String("role", oc.role), - zap.String("operator", r.String())) + err := oc.messageCenter.SendCommand(msg) + if err != nil { + log.Warn("send command to maintainer failed", + zap.String("role", oc.role), + zap.String("operator", r.String()), + zap.Error(err)) + } else { + log.Info("send command to maintainer", + zap.String("role", oc.role), + zap.String("operator", r.String())) + } } executedItem++ if executedItem >= oc.batchSize { @@ -100,34 +119,207 @@ func (oc *Controller) Execute() time.Time { // AddOperator adds an operator to the controller, if the operator already exists, return false. 
func (oc *Controller) AddOperator(op operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus]) bool { + oc.mu.Lock() + cf, ok := oc.precheckAddOperatorLocked(op) + if !ok { + oc.mu.Unlock() + return false + } + shouldBumpEpoch := oc.shouldBumpChangefeedEpoch(op) + if !shouldBumpEpoch { + oc.pushOperator(op) + oc.mu.Unlock() + return true + } + oc.epochBumping[op.ID()] = struct{}{} + // Epoch bump may call PD and etcd, so keep it outside oc.mu. Recheck the + // operator slot before pushing because stop or another operator may win + // while the bump is in flight. + oc.mu.Unlock() + + info, err := oc.bumpChangefeedEpoch(cf) oc.mu.Lock() defer oc.mu.Unlock() + delete(oc.epochBumping, op.ID()) + if err != nil { + log.Warn("add operator failed, cannot bump changefeed epoch", + zap.String("role", oc.role), + zap.String("operator", op.String()), + zap.Stringer("changefeed", op.ID()), + zap.Error(err)) + return false + } + + if !oc.recheckAddOperatorLocked(op, cf) { + return false + } + cf.SetInfo(info) + oc.pushOperator(op) + return true +} +// precheckAddOperatorLocked reserves the changefeed for an add-like operator +// before any expensive epoch bump is started. The caller must hold oc.mu. 
+func (oc *Controller) precheckAddOperatorLocked( + op operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], +) (*changefeed.Changefeed, bool) { if pre, ok := oc.operators[op.ID()]; ok { log.Info("add operator failed, operator already exists", zap.String("role", oc.role), zap.Stringer("operator", op), zap.Stringer("previousOperator", pre.OP)) - return false + return nil, false + } + if _, ok := oc.epochBumping[op.ID()]; ok { + log.Info("add operator failed, epoch bump already in progress", + zap.String("role", oc.role), + zap.Stringer("operator", op)) + return nil, false } cf := oc.changefeedDB.GetByID(op.ID()) if cf == nil { log.Warn("add operator failed, changefeed not found", zap.String("role", oc.role), zap.String("operator", op.String())) + return nil, false + } + return cf, true +} + +// recheckAddOperatorLocked verifies the original changefeed still owns the slot +// after the epoch bump completed outside oc.mu. +func (oc *Controller) recheckAddOperatorLocked( + op operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], + cf *changefeed.Changefeed, +) bool { + if pre, ok := oc.operators[op.ID()]; ok { + log.Info("add operator failed, operator already exists after epoch bump", + zap.String("role", oc.role), + zap.Stringer("operator", op), zap.Stringer("previousOperator", pre.OP)) + return false + } + current := oc.changefeedDB.GetByID(op.ID()) + if current == nil { + log.Warn("add operator failed, changefeed not found after epoch bump", + zap.String("role", oc.role), + zap.String("operator", op.String())) + return false + } + if current != cf { + log.Warn("add operator failed, changefeed changed after epoch bump", + zap.String("role", oc.role), + zap.String("operator", op.String())) return false } - oc.pushOperator(op) return true } -// StopChangefeed stop changefeed when the changefeed is stopped/removed. 
-// if remove is true, it will remove the changefeed from the chagnefeed DB -// if remove is false, it only marks as the changefeed stooped in changefeed DB, so we will not schedule the changefeed again +// shouldBumpChangefeedEpoch gates epoch bumps to operators that create a new +// maintainer owner and to runtime paths that have a real metastore and PD client. +func (oc *Controller) shouldBumpChangefeedEpoch( + op operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], +) bool { + return requiresNewMaintainerOwnership(op) && oc.pdClient != nil && oc.backend != nil +} + +// bumpChangefeedEpoch persists the next owner epoch before the operator is queued. +func (oc *Controller) bumpChangefeedEpoch(cf *changefeed.Changefeed) (*config.ChangeFeedInfo, error) { + ctx, cancel := context.WithTimeout(context.Background(), operatorEpochBumpTimeout) + defer cancel() + + epoch := pdutil.GenerateChangefeedEpoch(ctx, oc.pdClient) + return oc.backend.BumpChangefeedEpoch( + ctx, + cf.ID, + epoch, + changefeed.EpochBumpOptions{}, + ) +} + +// requiresNewMaintainerOwnership identifies operators that will create a new +// maintainer generation and therefore must advance the persisted owner epoch. +func requiresNewMaintainerOwnership( + op operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], +) bool { + switch op.(type) { + case *AddMaintainerOperator, *MoveMaintainerOperator: + return true + default: + return false + } +} + +// StopChangefeed stops a changefeed when the changefeed is stopped or removed. 
+// if remove is true, it will remove the changefeed from the changefeed DB +// if remove is false, it only marks the changefeed stopped in changefeed DB, so we will not schedule the changefeed again func (oc *Controller) StopChangefeed(_ context.Context, cfID common.ChangeFeedID, removed bool) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + return oc.stopChangefeed(cfID, removed, 0, false) +} + +// StopChangefeedWithMaintainerEpoch stops the current maintainer with the epoch +// it already owns, even if the in-memory changefeed has advanced to a newer +// ownership epoch. +func (oc *Controller) StopChangefeedWithMaintainerEpoch( + _ context.Context, + cfID common.ChangeFeedID, + removed bool, + maintainerEpoch uint64, +) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + return oc.stopChangefeed(cfID, removed, maintainerEpoch, true) +} + +// StopRemoteMaintainerWithMaintainerEpoch stops a reported maintainer without +// changing the local changefeed placement. It is used during coordinator +// bootstrap when the reported maintainer is from an old ownership epoch and +// must occupy the operator slot until the old owner stops. +func (oc *Controller) StopRemoteMaintainerWithMaintainerEpoch( + cfID common.ChangeFeedID, + nodeID node.ID, + removed bool, + maintainerEpoch uint64, +) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + oc.mu.Lock() + defer oc.mu.Unlock() + + keyspaceID := common.DefaultKeyspaceID + changefeed := oc.changefeedDB.GetByID(cfID) + if changefeed != nil { + keyspaceID = changefeed.GetKeyspaceID() + } + return oc.pushStopChangefeedOperator(keyspaceID, cfID, nodeID, removed, maintainerEpoch) +} + +// stopChangefeed creates a stop operator using the owner epoch that must be +// fenced. During a move, the origin epoch is kept until the target is bound. 
+func (oc *Controller) stopChangefeed( + cfID common.ChangeFeedID, + removed bool, + maintainerEpoch uint64, + hasMaintainerEpoch bool, +) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { oc.mu.Lock() defer oc.mu.Unlock() + changefeed := oc.changefeedDB.GetByID(cfID) + keyspaceID := common.DefaultKeyspaceID + if changefeed != nil { + keyspaceID = changefeed.GetKeyspaceID() + if !hasMaintainerEpoch { + maintainerEpoch = changefeed.GetInfo().Epoch + } + } + + var originNode node.ID + var originEpoch uint64 + var useOriginEpoch bool + if !hasMaintainerEpoch { + originNode, originEpoch, useOriginEpoch = oc.moveOriginStopTargetLocked(cfID) + } + scheduledNode := oc.changefeedDB.StopByChangefeedID(cfID, removed) + if useOriginEpoch { + scheduledNode = originNode + maintainerEpoch = originEpoch + } if scheduledNode == "" { log.Info("changefeed is not scheduled, try stop maintainer using coordinator node", zap.String("role", oc.role), @@ -136,21 +328,38 @@ func (oc *Controller) StopChangefeed(_ context.Context, cfID common.ChangeFeedID scheduledNode = oc.selfNode.ID } - changefeed := oc.changefeedDB.GetByID(cfID) - keyspaceID := changefeed.GetKeyspaceID() + return oc.pushStopChangefeedOperator(keyspaceID, cfID, scheduledNode, removed, maintainerEpoch) +} - return oc.pushStopChangefeedOperator(keyspaceID, cfID, scheduledNode, removed) +// moveOriginStopTargetLocked returns the origin maintainer stop target for an +// in-flight move. The caller must hold oc.mu. +func (oc *Controller) moveOriginStopTargetLocked(cfID common.ChangeFeedID) (node.ID, uint64, bool) { + old, ok := oc.operators[cfID] + if !ok { + return "", 0, false + } + moveOp, ok := old.OP.(*MoveMaintainerOperator) + if !ok { + return "", 0, false + } + return moveOp.originStopTarget() } // pushStopChangefeedOperator pushes a stop changefeed operator to the controller. // it checks if the operator already exists, if exists, it will replace the old one. 
// if the old operator is the removing operator, it will skip this operator. -func (oc *Controller) pushStopChangefeedOperator(keyspaceID uint32, cfID common.ChangeFeedID, nodeID node.ID, remove bool) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { - op := NewStopChangefeedOperator(keyspaceID, cfID, nodeID, oc.selfNode.ID, oc.backend, remove) +func (oc *Controller) pushStopChangefeedOperator( + keyspaceID uint32, + cfID common.ChangeFeedID, + nodeID node.ID, + remove bool, + maintainerEpoch uint64, +) operator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + op := NewStopChangefeedOperator(keyspaceID, cfID, nodeID, oc.selfNode.ID, oc.backend, remove, maintainerEpoch) if old, ok := oc.operators[cfID]; ok { oldStop, ok := old.OP.(*StopChangefeedOperator) if ok { - if oldStop.changefeedIsRemoved { + if oldStop.changefeedRemoved { log.Info("changefeed is in removing progress, skip the stop operator", zap.String("role", oc.role), zap.String("changefeed", cfID.Name())) diff --git a/coordinator/operator/operator_controller_test.go b/coordinator/operator/operator_controller_test.go index fc0503bcac..8589972694 100644 --- a/coordinator/operator/operator_controller_test.go +++ b/coordinator/operator/operator_controller_test.go @@ -15,7 +15,10 @@ package operator import ( "context" + "encoding/json" + "sync" "testing" + "time" "github.com/golang/mock/gomock" "github.com/pingcap/ticdc/coordinator/changefeed" @@ -26,14 +29,28 @@ import ( "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/node" + scheduleroperator "github.com/pingcap/ticdc/pkg/scheduler/operator" "github.com/pingcap/ticdc/server/watcher" "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/oracle" + pd "github.com/tikv/pd/client" ) +type operatorEpochPDClient struct { + pd.Client + physical int64 + logical int64 +} + +func (m *operatorEpochPDClient) GetTS(ctx context.Context) (int64, int64, error) { 
+ return m.physical, m.logical, nil +} + func newOperatorControllerForTest( t *testing.T, changefeedDB *changefeed.ChangefeedDB, backend changefeed.Backend, + pdClient pd.Client, ) (*Controller, *node.Info, *watcher.NodeManager) { t.Helper() @@ -44,14 +61,14 @@ func newOperatorControllerForTest( appcontext.SetService(appcontext.MessageCenter, messaging.NewMockMessageCenter()) appcontext.SetService(watcher.NodeManagerName, nodeManager) - return NewOperatorController(self, changefeedDB, backend, 10), self, nodeManager + return NewOperatorController(self, changefeedDB, backend, pdClient, 10), self, nodeManager } func TestController_StopChangefeed(t *testing.T) { changefeedDB := changefeed.NewChangefeedDB(1216) ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - oc, self, _ := newOperatorControllerForTest(t, changefeedDB, backend) + oc, self, _ := newOperatorControllerForTest(t, changefeedDB, backend, nil) cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ ChangefeedID: cfID, @@ -71,11 +88,66 @@ func TestController_StopChangefeed(t *testing.T) { require.Len(t, oc.operators, 1) } +func TestController_StopChangefeedWithMaintainerEpoch(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, self, _ := newOperatorControllerForTest(t, changefeedDB, backend, nil) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 20, + }, + 1, true) + changefeedDB.AddReplicatingMaintainer(cf, self.ID) + + op := oc.StopChangefeedWithMaintainerEpoch(context.Background(), cfID, false, 10) + req := op.Schedule().Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, 
uint64(10), req.MaintainerEpoch) +} + +func TestController_StopRemoteMaintainerWithMaintainerEpoch(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, _, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, nil) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 20, + }, 1, true) + changefeedDB.AddAbsentChangefeed(cf) + + oldNode := node.ID("old-node") + nodeManager.GetAliveNodes()[oldNode] = &node.Info{ID: oldNode} + op := oc.StopRemoteMaintainerWithMaintainerEpoch(cfID, oldNode, false, 10) + reqMsg := op.Schedule() + require.Equal(t, oldNode, reqMsg.To) + req := reqMsg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, uint64(10), req.MaintainerEpoch) + + op.Check(oldNode, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: 20, + }) + require.False(t, op.IsFinished()) + + op.Check(oldNode, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: 10, + }) + require.True(t, op.IsFinished()) +} + func TestController_AddOperator(t *testing.T) { changefeedDB := changefeed.NewChangefeedDB(1216) ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend) + oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, nil) target := node.NewInfo("localhost:8301", "") nodeManager.GetAliveNodes()[target.ID] = target cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) @@ -109,7 +181,7 @@ func TestController_HasOperatorInvolvingNode(t *testing.T) { changefeedDB := changefeed.NewChangefeedDB(1216) ctrl := 
gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend) + oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, nil) target := node.NewInfo("localhost:8301", "") nodeManager.GetAliveNodes()[target.ID] = target @@ -133,7 +205,7 @@ func TestController_CountMoveMaintainerOperatorsFromNodes(t *testing.T) { changefeedDB := changefeed.NewChangefeedDB(1216) ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend) + oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, nil) dest := node.NewInfo("localhost:8301", "") nodeManager.GetAliveNodes()[dest.ID] = dest @@ -151,12 +223,329 @@ func TestController_CountMoveMaintainerOperatorsFromNodes(t *testing.T) { require.Equal(t, 0, oc.CountMoveMaintainerOperatorsFromNodes([]node.ID{"n3"})) } +func TestController_AddOperatorBumpsAndPersistsOwnershipEpoch(t *testing.T) { + testCases := []struct { + name string + addToDB func(*changefeed.ChangefeedDB, *changefeed.Changefeed, node.ID) + newOp func(*changefeed.ChangefeedDB, *changefeed.Changefeed, node.ID, node.ID) scheduleroperator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] + }{ + { + name: "add-maintainer", + addToDB: func(db *changefeed.ChangefeedDB, cf *changefeed.Changefeed, self node.ID) { + db.AddAbsentChangefeed(cf) + }, + newOp: func(db *changefeed.ChangefeedDB, cf *changefeed.Changefeed, self, dest node.ID) scheduleroperator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + return NewAddMaintainerOperator(db, cf, dest) + }, + }, + { + name: "move-maintainer", + addToDB: func(db *changefeed.ChangefeedDB, cf *changefeed.Changefeed, self node.ID) { + db.AddReplicatingMaintainer(cf, self) + }, + newOp: func(db *changefeed.ChangefeedDB, cf *changefeed.Changefeed, self, dest node.ID) 
scheduleroperator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] { + return NewMoveMaintainerOperator(db, cf, self, dest) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, &operatorEpochPDClient{physical: 100, logical: 1}) + target := node.NewInfo("localhost:8301", "") + nodeManager.GetAliveNodes()[target.ID] = target + + candidateEpoch := oracle.ComposeTS(100, 1) + oldEpoch := candidateEpoch + 10 + expectedEpoch := oldEpoch + 1 + cfID := common.NewChangeFeedIDWithName(tc.name, common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: oldEpoch, + }, 123, true) + tc.addToDB(changefeedDB, cf, self.ID) + + backend.EXPECT(). + BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), changefeed.EpochBumpOptions{}). + DoAndReturn(func(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + require.NotZero(t, candidateEpoch) + require.Equal(t, cfID, id) + require.False(t, options.UpdateStatus) + info, err := cf.GetInfo().Clone() + require.NoError(t, err) + require.Equal(t, oldEpoch, info.Epoch) + info.Epoch = expectedEpoch + return info, nil + }). 
+ Times(1) + + op := tc.newOp(changefeedDB, cf, self.ID, target.ID) + require.True(t, oc.AddOperator(op)) + require.Equal(t, expectedEpoch, cf.GetInfo().Epoch) + if move, ok := op.(*MoveMaintainerOperator); ok { + removeReq := move.Schedule().Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, oldEpoch, removeReq.MaintainerEpoch) + + move.Check(self.ID, &heartbeatpb.MaintainerStatus{State: heartbeatpb.ComponentState_Stopped}) + addReq := move.Schedule().Message[0].(*heartbeatpb.AddMaintainerRequest) + info := &config.ChangeFeedInfo{} + require.NoError(t, json.Unmarshal(addReq.Config, info)) + require.Equal(t, expectedEpoch, info.Epoch) + } + + req := cf.NewAddMaintainerMessage(target.ID).Message[0].(*heartbeatpb.AddMaintainerRequest) + info := &config.ChangeFeedInfo{} + require.NoError(t, json.Unmarshal(req.Config, info)) + require.Equal(t, expectedEpoch, info.Epoch) + }) + } +} + +func TestController_AddOperatorRejectsConcurrentEpochBump(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, _, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, &operatorEpochPDClient{physical: 100, logical: 1}) + target1 := node.NewInfo("localhost:8301", "") + target2 := node.NewInfo("localhost:8302", "") + nodeManager.GetAliveNodes()[target1.ID] = target1 + nodeManager.GetAliveNodes()[target2.ID] = target2 + + cfID := common.NewChangeFeedIDWithName("concurrent-epoch-bump", common.DefaultKeyspaceName) + oldEpoch := uint64(10) + newEpoch := uint64(20) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: oldEpoch, + }, 123, true) + changefeedDB.AddAbsentChangefeed(cf) + + bumpStarted := make(chan struct{}) + releaseBump := make(chan struct{}) + backend.EXPECT(). 
+ BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), changefeed.EpochBumpOptions{}). + DoAndReturn(func(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + close(bumpStarted) + <-releaseBump + info, err := cf.GetInfo().Clone() + require.NoError(t, err) + info.Epoch = newEpoch + return info, nil + }). + Times(1) + + firstResult := make(chan bool, 1) + go func() { + firstResult <- oc.AddOperator(NewAddMaintainerOperator(changefeedDB, cf, target1.ID)) + }() + select { + case <-bumpStarted: + case <-time.After(time.Second): + close(releaseBump) + require.FailNow(t, "timed out waiting for epoch bump to start") + } + + require.False(t, oc.AddOperator(NewAddMaintainerOperator(changefeedDB, cf, target2.ID))) + close(releaseBump) + select { + case ok := <-firstResult: + require.True(t, ok) + case <-time.After(time.Second): + require.FailNow(t, "timed out waiting for epoch bump to finish") + } + require.Equal(t, newEpoch, cf.GetInfo().Epoch) + require.NotNil(t, oc.GetOperator(cfID)) +} + +func TestController_StopChangefeedDuringMoveUsesOriginEpoch(t *testing.T) { + testCases := []struct { + name string + originStoppedBeforeStop bool + }{ + { + name: "before-origin-stopped", + }, + { + name: "after-origin-stopped-before-dest-bind", + originStoppedBeforeStop: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, self, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, &operatorEpochPDClient{physical: 100, logical: 1}) + target := node.NewInfo("localhost:8301", "") + nodeManager.GetAliveNodes()[target.ID] = target + + oldEpoch := uint64(10) + newEpoch := uint64(20) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, 
&config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: oldEpoch, + }, 1, true) + changefeedDB.AddReplicatingMaintainer(cf, self.ID) + + backend.EXPECT(). + BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), changefeed.EpochBumpOptions{}). + DoAndReturn(func(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + info, err := cf.GetInfo().Clone() + require.NoError(t, err) + info.Epoch = newEpoch + return info, nil + }). + Times(1) + + moveOp := NewMoveMaintainerOperator(changefeedDB, cf, self.ID, target.ID) + require.True(t, oc.AddOperator(moveOp)) + require.Equal(t, newEpoch, cf.GetInfo().Epoch) + + if tc.originStoppedBeforeStop { + moveOp.Check(self.ID, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: oldEpoch, + }) + require.Equal(t, moveMaintainerStateOriginStopped, moveOp.state) + } + + stopOp := oc.StopChangefeed(context.Background(), cfID, false) + require.Equal(t, "stop", stopOp.Type()) + reqMsg := stopOp.Schedule() + require.Equal(t, self.ID, reqMsg.To) + req := reqMsg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, oldEpoch, req.MaintainerEpoch) + + stopOp.Check(self.ID, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: newEpoch, + }) + require.False(t, stopOp.IsFinished()) + + stopOp.Check(self.ID, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: oldEpoch, + }) + require.True(t, stopOp.IsFinished()) + }) + } +} + +func TestController_AddOperatorEpochBumpDoesNotBlockStatusAndStop(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + ctrl := gomock.NewController(t) + backend := mock_changefeed.NewMockBackend(ctrl) + oc, _, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, 
&operatorEpochPDClient{physical: 100, logical: 1}) + target := node.NewInfo("localhost:8301", "") + nodeManager.GetAliveNodes()[target.ID] = target + + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 10, + }, 1, true) + changefeedDB.AddAbsentChangefeed(cf) + + bumpStarted := make(chan struct{}) + bumpHasDeadline := make(chan bool, 1) + unblockBump := make(chan struct{}) + var unblockOnce sync.Once + unblock := func() { + unblockOnce.Do(func() { + close(unblockBump) + }) + } + t.Cleanup(unblock) + + backend.EXPECT(). + BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), changefeed.EpochBumpOptions{}). + DoAndReturn(func(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + _, ok := ctx.Deadline() + bumpHasDeadline <- ok + close(bumpStarted) + select { + case <-unblockBump: + info, err := cf.GetInfo().Clone() + if err != nil { + return nil, err + } + info.Epoch = candidateEpoch + return info, nil + case <-ctx.Done(): + return nil, ctx.Err() + } + }). 
+ Times(1) + + addDone := make(chan bool, 1) + go func() { + addDone <- oc.AddOperator(NewAddMaintainerOperator(changefeedDB, cf, target.ID)) + }() + + select { + case <-bumpStarted: + case <-time.After(time.Second): + t.Fatal("AddOperator did not reach epoch bump") + } + require.True(t, <-bumpHasDeadline) + + statusDone := make(chan struct{}, 1) + go func() { + oc.UpdateOperatorStatus(cfID, target.ID, &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Working, + }) + statusDone <- struct{}{} + }() + select { + case <-statusDone: + case <-time.After(time.Second): + t.Fatal("UpdateOperatorStatus blocked behind AddOperator epoch bump") + } + + stopDone := make(chan scheduleroperator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus], 1) + go func() { + stopDone <- oc.StopChangefeed(context.Background(), cfID, true) + }() + var stopOp scheduleroperator.Operator[common.ChangeFeedID, *heartbeatpb.MaintainerStatus] + select { + case stopOp = <-stopDone: + case <-time.After(time.Second): + t.Fatal("StopChangefeed blocked behind AddOperator epoch bump") + } + require.NotNil(t, stopOp) + require.Equal(t, "stop", stopOp.Type()) + require.Equal(t, 1, oc.OperatorSize()) + + unblock() + select { + case added := <-addDone: + require.False(t, added) + case <-time.After(time.Second): + t.Fatal("AddOperator did not finish after epoch bump was unblocked") + } + require.Equal(t, 1, oc.OperatorSize()) + require.Equal(t, "stop", oc.GetOperator(cfID).Type()) +} + func TestController_StopChangefeedDuringAddOperator(t *testing.T) { // Setup test environment changefeedDB := changefeed.NewChangefeedDB(1216) ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - oc, _, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend) + oc, _, nodeManager := newOperatorControllerForTest(t, changefeedDB, backend, nil) target := node.NewInfo("localhost:8301", "") nodeManager.GetAliveNodes()[target.ID] = target diff --git 
a/coordinator/operator/operator_move.go b/coordinator/operator/operator_move.go index 9c15c936af..cb91ed1ac4 100644 --- a/coordinator/operator/operator_move.go +++ b/coordinator/operator/operator_move.go @@ -26,33 +26,49 @@ import ( "go.uber.org/zap" ) +// moveMaintainerState describes the origin-first handoff flow for a maintainer move. +type moveMaintainerState int + +const ( + moveMaintainerStateRemoveOrigin moveMaintainerState = iota + moveMaintainerStateOriginStopped + moveMaintainerStateAddTarget + moveMaintainerStateDoneSuccess + moveMaintainerStateDoneNoPostFinish +) + // MoveMaintainerOperator is an operator to move a maintainer to the destination node type MoveMaintainerOperator struct { changefeed *changefeed.Changefeed db *changefeed.ChangefeedDB origin node.ID - dest node.ID - - originNodeStopped bool - finished bool - bind bool + target node.ID - canceled bool + originMaintainerEpoch uint64 + state moveMaintainerState + // originRemoved records node-liveness removal for the origin node. The move + // may still be waiting on target removal when this flag is set. + originRemoved bool lck sync.Mutex } +// NewMoveMaintainerOperator creates an operator that moves a maintainer from origin to dest. func NewMoveMaintainerOperator(db *changefeed.ChangefeedDB, changefeed *changefeed.Changefeed, origin, dest node.ID, ) *MoveMaintainerOperator { return &MoveMaintainerOperator{ changefeed: changefeed, origin: origin, - dest: dest, + target: dest, db: db, + // The move first removes the origin maintainer and then adds the + // destination. The remove must use the epoch the origin already owns. + originMaintainerEpoch: changefeed.GetInfo().Epoch, } } +// Check observes maintainer status and advances the move handoff state. 
func (m *MoveMaintainerOperator) Check(from node.ID, status *heartbeatpb.MaintainerStatus) { m.lck.Lock() defer m.lck.Unlock() @@ -60,88 +76,155 @@ func (m *MoveMaintainerOperator) Check(from node.ID, status *heartbeatpb.Maintai if status == nil { return } + if m.isFinishedLocked() { + return + } - if from == m.origin && status.State != heartbeatpb.ComponentState_Working { - log.Info("changefeed changefeedIsRemoved from origin node", + if from == m.origin && + m.state == moveMaintainerStateRemoveOrigin && + common.MaintainerEpochMatches(status.MaintainerEpoch, m.originMaintainerEpoch) && + status.State != heartbeatpb.ComponentState_Working { + log.Info("changefeed removed from origin node", zap.String("changefeed", m.changefeed.ID.String())) - m.originNodeStopped = true + m.state = moveMaintainerStateOriginStopped } - if m.originNodeStopped && from == m.dest && + if m.state == moveMaintainerStateAddTarget && + from == m.target && + common.MaintainerEpochMatches(status.MaintainerEpoch, m.changefeed.GetInfo().Epoch) && status.State == heartbeatpb.ComponentState_Working && status.BootstrapDone { log.Info("changefeed added to dest node", - zap.String("dest", m.dest.String()), + zap.String("dest", m.target.String()), zap.String("changefeed", m.changefeed.ID.String())) - m.finished = true + m.state = moveMaintainerStateDoneSuccess } } +// Schedule returns the next remove or add command needed by the current move state. 
func (m *MoveMaintainerOperator) Schedule() *messaging.TargetMessage { m.lck.Lock() defer m.lck.Unlock() - if m.finished || m.canceled { + if m.isFinishedLocked() { return nil } - if m.originNodeStopped { - if !m.bind { - m.db.BindChangefeedToNode(m.origin, m.dest, m.changefeed) - m.bind = true - } - return m.changefeed.NewAddMaintainerMessage(m.dest) + switch m.state { + case moveMaintainerStateOriginStopped: + m.enterAddTargetLocked() + return m.changefeed.NewAddMaintainerMessage(m.target) + case moveMaintainerStateAddTarget: + return m.changefeed.NewAddMaintainerMessage(m.target) + case moveMaintainerStateRemoveOrigin: + return changefeed.RemoveMaintainerMessage( + m.changefeed.GetKeyspaceID(), + m.changefeed.ID, + m.origin, + false, + false, + m.originMaintainerEpoch, + ) + default: + return nil } - return m.changefeed.NewRemoveMaintainerMessage(m.origin, false, false) } +// OnNodeRemove updates the move when either the origin or target node goes offline. func (m *MoveMaintainerOperator) OnNodeRemove(n node.ID) { m.lck.Lock() defer m.lck.Unlock() - if m.canceled { + if m.state == moveMaintainerStateDoneNoPostFinish { return } - if n == m.dest { - // Node removal must win over a just-finished move. Otherwise PostFinish can still - // mark the changefeed replicating on a node that has already been removed. 
- if m.originNodeStopped { - log.Info("dest node is stopped, mark changefeed absent", - zap.String("changefeed", m.changefeed.ID.String()), - zap.String("dest", m.dest.String())) - m.db.MarkMaintainerAbsent(m.changefeed) - m.canceled = true - return - } - - log.Info("changefeed changefeed is removed from dest node", - zap.String("dest", m.dest.String()), - zap.String("origin", m.origin.String()), - zap.String("changefeed", m.changefeed.ID.String())) - // here we translate the move to an add operation, so we need to swap the origin and dest - // we need to reset the origin node finished flag - m.dest = m.origin - m.db.BindChangefeedToNode(m.dest, m.origin, m.changefeed) - m.bind = true - m.originNodeStopped = true - return + if n == m.origin { + m.originRemoved = true } - if m.finished { + if n == m.target { + m.onTargetNodeRemovedLocked() return } if n == m.origin { log.Info("origin node is stopped", zap.String("origin", m.origin.String()), zap.String("changefeed", m.changefeed.ID.String())) - m.originNodeStopped = true + switch m.state { + case moveMaintainerStateRemoveOrigin: + m.state = moveMaintainerStateOriginStopped + case moveMaintainerStateOriginStopped: + } + } +} + +// onTargetNodeRemovedLocked handles target loss without creating two live owners. +func (m *MoveMaintainerOperator) onTargetNodeRemovedLocked() { + switch m.state { + case moveMaintainerStateRemoveOrigin: + if m.target == m.origin { + m.finishAsAbsentLocked() + return + } + log.Info("destination node removed before origin maintainer stopped", + zap.String("dest", m.target.String()), + zap.String("origin", m.origin.String()), + zap.String("changefeed", m.changefeed.ID.String())) + // Keep removing the old origin maintainer first. The new owner can only + // be added back to origin after the old epoch reports stopped. 
+ m.target = m.origin + case moveMaintainerStateOriginStopped: + if m.target == m.origin { + m.finishAsAbsentLocked() + return + } + log.Info("destination node removed after origin maintainer stopped", + zap.String("dest", m.target.String()), + zap.String("origin", m.origin.String()), + zap.String("changefeed", m.changefeed.ID.String())) + if m.originRemoved { + m.finishAsAbsentLocked() + return + } + m.target = m.origin + case moveMaintainerStateAddTarget, moveMaintainerStateDoneSuccess: + // Once the add request may have reached target, rebinding to origin can + // create two new-epoch maintainers. Mark absent and let scheduler retry + // with a fresh ownership epoch. + m.finishAsAbsentLocked() } } +// enterAddTargetLocked binds the changefeed to the target before sending add requests. +func (m *MoveMaintainerOperator) enterAddTargetLocked() { + m.db.BindChangefeedToNode(m.origin, m.target, m.changefeed) + m.state = moveMaintainerStateAddTarget +} + +// finishAsAbsentLocked finishes the move without PostFinish side effects. +func (m *MoveMaintainerOperator) finishAsAbsentLocked() { + log.Info("move maintainer operator aborted, mark changefeed absent", + zap.String("changefeed", m.changefeed.ID.String()), + zap.String("origin", m.origin.String()), + zap.String("target", m.target.String())) + m.db.MarkMaintainerAbsent(m.changefeed) + m.state = moveMaintainerStateDoneNoPostFinish +} + +func (m *MoveMaintainerOperator) isFinishedLocked() bool { + return m.state == moveMaintainerStateDoneSuccess || m.state == moveMaintainerStateDoneNoPostFinish +} + +func (m *MoveMaintainerOperator) isOriginStopTargetLocked() bool { + return m.state == moveMaintainerStateRemoveOrigin || + m.state == moveMaintainerStateOriginStopped +} + +// AffectedNodes returns the origin and current target nodes touched by this move. 
func (m *MoveMaintainerOperator) AffectedNodes() []node.ID { m.lck.Lock() defer m.lck.Unlock() - return []node.ID{m.origin, m.dest} + return []node.ID{m.origin, m.target} } // OriginNode returns the source node of the move. @@ -152,38 +235,60 @@ func (m *MoveMaintainerOperator) OriginNode() node.ID { return m.origin } +// originStopTarget returns the origin maintainer until the destination owner +// has been bound. The origin keeps fencing close requests by the epoch it +// already owns, even after it has reported stopped, because no new owner exists +// before the move sends the destination add request. +func (m *MoveMaintainerOperator) originStopTarget() (node.ID, uint64, bool) { + m.lck.Lock() + defer m.lck.Unlock() + + if !m.isOriginStopTargetLocked() { + return "", 0, false + } + return m.origin, m.originMaintainerEpoch, true +} + +// ID returns the changefeed ID this operator works on. func (m *MoveMaintainerOperator) ID() common.ChangeFeedID { return m.changefeed.ID } +// IsFinished reports whether the move has reached a terminal state. func (m *MoveMaintainerOperator) IsFinished() bool { m.lck.Lock() defer m.lck.Unlock() - return m.finished || m.canceled + return m.isFinishedLocked() } +// OnTaskRemoved stops the move when the changefeed task is removed. func (m *MoveMaintainerOperator) OnTaskRemoved() { m.lck.Lock() defer m.lck.Unlock() - log.Info("changefeed is changefeedIsRemoved, mark move changefeed operator finished", + log.Info("changefeed removed, mark move changefeed operator finished", zap.String("changefeed", m.changefeed.ID.String())) - m.canceled = true + m.state = moveMaintainerStateDoneNoPostFinish } +// Start marks the changefeed as scheduling before the first move command is sent. func (m *MoveMaintainerOperator) Start() { m.lck.Lock() defer m.lck.Unlock() + if m.isFinishedLocked() { + return + } m.db.MarkMaintainerScheduling(m.changefeed) } +// PostFinish marks a successfully moved maintainer as replicating. 
func (m *MoveMaintainerOperator) PostFinish() { m.lck.Lock() defer m.lck.Unlock() - if m.canceled { + if m.state != moveMaintainerStateDoneSuccess { return } @@ -192,18 +297,21 @@ func (m *MoveMaintainerOperator) PostFinish() { m.db.MarkMaintainerReplicating(m.changefeed) } +// String returns a human-readable description of the operator. func (m *MoveMaintainerOperator) String() string { m.lck.Lock() defer m.lck.Unlock() return fmt.Sprintf("move maintainer operator: %s, origin:%s, dest:%s", - m.changefeed.ID, m.origin, m.dest) + m.changefeed.ID, m.origin, m.target) } +// Type returns the operator type used by metrics and logs. func (m *MoveMaintainerOperator) Type() string { return "move" } +// BlockTsForward indicates whether this operator blocks changefeed checkpoint forwarding. func (m *MoveMaintainerOperator) BlockTsForward() bool { log.Panic("unreachable code") return false diff --git a/coordinator/operator/operator_move_test.go b/coordinator/operator/operator_move_test.go index 4b9d4ec3cb..21f5ef03cd 100644 --- a/coordinator/operator/operator_move_test.go +++ b/coordinator/operator/operator_move_test.go @@ -25,53 +25,134 @@ import ( ) func TestMoveMaintainerOperator_OnNodeRemove(t *testing.T) { + t.Run("dest removed before origin stopped keeps removing origin", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, "dest-before-origin-stopped") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.OnNodeRemove("n2") + require.Equal(t, moveMaintainerStateRemoveOrigin, op.state) + require.Equal(t, node.ID("n1"), op.target) + + msg := op.Schedule() + require.Equal(t, node.ID("n1"), msg.To) + removeReq := msg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + require.Equal(t, cf.GetInfo().Epoch, removeReq.MaintainerEpoch) + require.Len(t, changefeedDB.GetByNodeID("n1"), 1) + }) + + t.Run("dest removed after origin stopped adds back to origin", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, 
"dest-after-origin-stopped") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.Check("n1", &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: cf.GetInfo().Epoch, + }) + require.Equal(t, moveMaintainerStateOriginStopped, op.state) + + op.OnNodeRemove("n2") + require.Equal(t, moveMaintainerStateOriginStopped, op.state) + require.Equal(t, node.ID("n1"), op.target) + + msg := op.Schedule() + require.Equal(t, node.ID("n1"), msg.To) + require.NotNil(t, msg.Message[0].(*heartbeatpb.AddMaintainerRequest)) + require.Equal(t, moveMaintainerStateAddTarget, op.state) + require.Len(t, changefeedDB.GetByNodeID("n1"), 1) + }) + + t.Run("dest removed after add starts marks absent", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, "dest-after-add-starts") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.Check("n1", &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: cf.GetInfo().Epoch, + }) + require.NotNil(t, op.Schedule()) + require.Equal(t, moveMaintainerStateAddTarget, op.state) + require.Equal(t, node.ID("n2"), cf.GetNodeID()) + + op.OnNodeRemove("n2") + require.True(t, op.IsFinished()) + require.Equal(t, 1, changefeedDB.GetAbsentSize()) + require.Len(t, changefeedDB.GetByNodeID("n2"), 0) + require.Nil(t, op.Schedule()) + + op.PostFinish() + require.Equal(t, 1, changefeedDB.GetAbsentSize()) + require.Len(t, changefeedDB.GetByNodeID("n2"), 0) + }) + + t.Run("origin removed falls through to destination add", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, "origin-removed") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.OnNodeRemove("n1") + require.Equal(t, moveMaintainerStateOriginStopped, op.state) + + msg := op.Schedule() + require.Equal(t, node.ID("n2"), msg.To) + require.NotNil(t, msg.Message[0].(*heartbeatpb.AddMaintainerRequest)) + require.Equal(t, 
moveMaintainerStateAddTarget, op.state) + require.Len(t, changefeedDB.GetByNodeID("n2"), 1) + }) + + t.Run("fallback origin removed marks absent", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, "fallback-origin-removed") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.OnNodeRemove("n2") + op.OnNodeRemove("n1") + + require.True(t, op.IsFinished()) + require.Equal(t, 1, changefeedDB.GetAbsentSize()) + require.Len(t, changefeedDB.GetByNodeID("n1"), 0) + require.Nil(t, op.Schedule()) + }) + + t.Run("origin removed before fallback target marks absent", func(t *testing.T) { + changefeedDB, cf := newMoveMaintainerTestChangefeed(t, "origin-before-fallback-target") + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + + op.OnNodeRemove("n1") + require.Equal(t, moveMaintainerStateOriginStopped, op.state) + + op.OnNodeRemove("n2") + require.True(t, op.IsFinished()) + require.Equal(t, 1, changefeedDB.GetAbsentSize()) + require.Len(t, changefeedDB.GetByNodeID("n1"), 0) + require.Len(t, changefeedDB.GetByNodeID("n2"), 0) + require.Nil(t, op.Schedule()) + }) +} + +func newMoveMaintainerTestChangefeed(t *testing.T, name string) (*changefeed.ChangefeedDB, *changefeed.Changefeed) { + t.Helper() + changefeedDB := changefeed.NewChangefeedDB(1216) - cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cfID := common.NewChangeFeedIDWithName(name, common.DefaultKeyspaceName) cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ ChangefeedID: cfID, Config: config.GetDefaultReplicaConfig(), SinkURI: "mysql://127.0.0.1:3306", - }, - 1, true) + Epoch: 1, + }, 1, true) changefeedDB.AddReplicatingMaintainer(cf, "n1") + return changefeedDB, cf +} - op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") - op.OnNodeRemove("n2") - - require.True(t, op.bind) - require.True(t, op.originNodeStopped) - require.Equal(t, "n1", op.dest.String()) - require.Len(t, changefeedDB.GetByNodeID("n1"), 1) - req 
:= op.Schedule().Message[0].(*heartbeatpb.AddMaintainerRequest) - require.NotNil(t, req) - require.Len(t, changefeedDB.GetByNodeID("n1"), 1) - - op.OnNodeRemove("n1") - require.Len(t, changefeedDB.GetByNodeID("n1"), 0) - require.Equal(t, 1, changefeedDB.GetAbsentSize()) - require.True(t, op.canceled) - require.Nil(t, op.Schedule()) - - cf2ID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - cf2 := changefeed.NewChangefeed(cf2ID, &config.ChangeFeedInfo{ - ChangefeedID: cf2ID, +func TestMoveMaintainerOperator_OnTaskRemoved(t *testing.T) { + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, Config: config.GetDefaultReplicaConfig(), SinkURI: "mysql://127.0.0.1:3306", }, 1, true) - changefeedDB.AddReplicatingMaintainer(cf2, "n1") - op2 := NewMoveMaintainerOperator(changefeedDB, cf2, "n1", "n2") - op2.OnNodeRemove("n1") - require.True(t, op2.originNodeStopped) - op2.Schedule() - require.True(t, op2.bind) - require.Len(t, changefeedDB.GetByNodeID("n2"), 1) -} - -func TestMoveMaintainerOperator_OnTaskRemoved(t *testing.T) { - op := NewMoveMaintainerOperator(nil, &changefeed.Changefeed{}, "n1", "n2") + op := NewMoveMaintainerOperator(nil, cf, "n1", "n2") op.OnTaskRemoved() - require.True(t, op.canceled) + require.True(t, op.IsFinished()) require.Nil(t, op.Schedule()) // backend is nil, but op is canceled , no nil pointer error op.PostFinish() @@ -91,23 +172,51 @@ func TestMoveMaintainerOperator_CheckRequiresDestBootstrapDone(t *testing.T) { op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") op.Check("n1", &heartbeatpb.MaintainerStatus{State: heartbeatpb.ComponentState_Stopped}) - require.True(t, op.originNodeStopped) - require.False(t, op.finished) + require.Equal(t, moveMaintainerStateOriginStopped, op.state) + require.False(t, op.IsFinished()) + require.NotNil(t, op.Schedule()) op.Check("n2", &heartbeatpb.MaintainerStatus{ State: 
heartbeatpb.ComponentState_Working, BootstrapDone: false, }) - require.False(t, op.finished) + require.False(t, op.IsFinished()) op.Check("n2", &heartbeatpb.MaintainerStatus{ State: heartbeatpb.ComponentState_Working, BootstrapDone: true, }) - require.True(t, op.finished) + require.True(t, op.IsFinished()) require.Nil(t, op.Schedule()) } +func TestMoveMaintainerOperator_CheckAcceptsCompatDestEpochDuringRollingUpgrade(t *testing.T) { + changefeedDB := changefeed.NewChangefeedDB(1216) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + SinkURI: "mysql://127.0.0.1:3306", + Epoch: 2, + }, 1, true) + changefeedDB.AddReplicatingMaintainer(cf, "n1") + + op := NewMoveMaintainerOperator(changefeedDB, cf, "n1", "n2") + op.Check("n1", &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: 2, + }) + require.NotNil(t, op.Schedule()) + status := &heartbeatpb.MaintainerStatus{ + State: heartbeatpb.ComponentState_Working, + BootstrapDone: true, + MaintainerEpoch: 0, + } + + op.Check("n2", status) + require.True(t, op.IsFinished()) +} + func TestMoveMaintainerOperator_OnNodeRemoveAfterFinishMarksAbsent(t *testing.T) { changefeedDB := changefeed.NewChangefeedDB(1216) cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) @@ -128,10 +237,10 @@ func TestMoveMaintainerOperator_OnNodeRemoveAfterFinishMarksAbsent(t *testing.T) State: heartbeatpb.ComponentState_Working, BootstrapDone: true, }) - require.True(t, op.finished) + require.True(t, op.IsFinished()) op.OnNodeRemove("n2") - require.True(t, op.canceled) + require.True(t, op.IsFinished()) require.Equal(t, 1, changefeedDB.GetAbsentSize()) require.Len(t, changefeedDB.GetByNodeID("n2"), 0) diff --git a/coordinator/operator/operator_stop.go b/coordinator/operator/operator_stop.go index 7dd10185cf..248357bce7 100644 --- 
a/coordinator/operator/operator_stop.go +++ b/coordinator/operator/operator_stop.go @@ -30,13 +30,14 @@ import ( // StopChangefeedOperator is an operator to remove a maintainer from a node type StopChangefeedOperator struct { - keyspaceID uint32 - cfID common.ChangeFeedID - nodeID node.ID - changefeedIsRemoved bool - finished atomic.Bool - coordinatorNodeID node.ID - backend changefeed.Backend + keyspaceID uint32 + cfID common.ChangeFeedID + nodeID node.ID + changefeedRemoved bool + finished atomic.Bool + coordinatorNodeID node.ID + backend changefeed.Backend + maintainerEpoch uint64 } func NewStopChangefeedOperator( @@ -46,19 +47,24 @@ func NewStopChangefeedOperator( coordinatorNode node.ID, backend changefeed.Backend, removed bool, + maintainerEpoch uint64, ) *StopChangefeedOperator { return &StopChangefeedOperator{ - keyspaceID: keyspaceID, - cfID: cfID, - nodeID: nodeID, - changefeedIsRemoved: removed, - coordinatorNodeID: coordinatorNode, - backend: backend, + keyspaceID: keyspaceID, + cfID: cfID, + nodeID: nodeID, + changefeedRemoved: removed, + coordinatorNodeID: coordinatorNode, + backend: backend, + maintainerEpoch: maintainerEpoch, } } -func (m *StopChangefeedOperator) Check(_ node.ID, status *heartbeatpb.MaintainerStatus) { - if !m.finished.Load() && status.State != heartbeatpb.ComponentState_Working { +func (m *StopChangefeedOperator) Check(from node.ID, status *heartbeatpb.MaintainerStatus) { + if !m.finished.Load() && + from == m.nodeID && + common.MaintainerEpochMatches(status.MaintainerEpoch, m.maintainerEpoch) && + status.State != heartbeatpb.ComponentState_Working { log.Info("maintainer report non-working status", zap.Stringer("maintainer", m.cfID)) m.finished.Store(true) @@ -66,7 +72,14 @@ func (m *StopChangefeedOperator) Check(_ node.ID, status *heartbeatpb.Maintainer } func (m *StopChangefeedOperator) Schedule() *messaging.TargetMessage { - return changefeed.RemoveMaintainerMessage(m.keyspaceID, m.cfID, m.nodeID, true, m.changefeedIsRemoved) + 
return changefeed.RemoveMaintainerMessage( + m.keyspaceID, + m.cfID, + m.nodeID, + true, + m.changefeedRemoved, + m.maintainerEpoch, + ) } // OnNodeRemove is called when node offline, and the maintainer must already move to absent status and will be scheduled again @@ -101,7 +114,7 @@ func (m *StopChangefeedOperator) Start() { } func (m *StopChangefeedOperator) PostFinish() { - if m.changefeedIsRemoved { + if m.changefeedRemoved { if err := m.backend.DeleteChangefeed(context.Background(), m.cfID); err != nil { log.Warn("failed to delete changefeed", zap.Stringer("changefeed", m.cfID), @@ -124,7 +137,7 @@ func (m *StopChangefeedOperator) PostFinish() { func (m *StopChangefeedOperator) String() string { return fmt.Sprintf("stop maintainer operator: %s, dest %s, remove %t", - m.cfID, m.nodeID, m.changefeedIsRemoved) + m.cfID, m.nodeID, m.changefeedRemoved) } func (m *StopChangefeedOperator) Type() string { diff --git a/coordinator/operator/operator_stop_test.go b/coordinator/operator/operator_stop_test.go index a7002887fb..399d272476 100644 --- a/coordinator/operator/operator_stop_test.go +++ b/coordinator/operator/operator_stop_test.go @@ -38,7 +38,7 @@ func TestStopChangefeedOperator_OnNodeRemove(t *testing.T) { ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, true) + op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, true, 10) op.OnNodeRemove("n1") require.Equal(t, "n2", op.nodeID.String()) require.False(t, op.finished.Load()) @@ -54,7 +54,7 @@ func TestStopChangefeedOperator_OnTaskRemoved(t *testing.T) { }, 1, true) changefeedDB.AddReplicatingMaintainer(cf, "n1") - op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", nil, true) + op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", nil, true, 10) op.OnTaskRemoved() require.True(t, op.finished.Load()) } @@ -72,11 +72,11 
@@ func TestStopChangefeedOperator_PostFinish(t *testing.T) { ctrl := gomock.NewController(t) backend := mock_changefeed.NewMockBackend(ctrl) - op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, true) + op := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, true, 10) backend.EXPECT().DeleteChangefeed(gomock.Any(), cfID).Return(errors.New("err")) op.PostFinish() - op2 := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, false) + op2 := NewStopChangefeedOperator(common.DefaultKeyspaceID, cfID, "n1", "n2", backend, false, 10) backend.EXPECT().SetChangefeedProgress(gomock.Any(), cfID, config.ProgressNone).Return(errors.New("err")) op2.PostFinish() } diff --git a/coordinator/scheduler/balance_test.go b/coordinator/scheduler/balance_test.go index 7047331f97..57f108c2f1 100644 --- a/coordinator/scheduler/balance_test.go +++ b/coordinator/scheduler/balance_test.go @@ -47,7 +47,7 @@ func TestBalanceSchedulerCreatesMoveOperators(t *testing.T) { addReplicatingMaintainer(t, db, "cf-a-2", nodeA) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBalanceScheduler("test", 10, oc, db, 0, drainController) _ = s.Execute() @@ -80,7 +80,7 @@ func TestBalanceSchedulerSkipsWhenDrainActive(t *testing.T) { addReplicatingMaintainer(t, db, "cf-a-4", nodeA) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBalanceScheduler("test", 10, oc, db, 0, drainController) _ = s.Execute() @@ -118,7 +118,7 @@ func TestBalanceSchedulerSkipsUntilObservedDrainBlockWindowExpires(t *testing.T) addReplicatingMaintainer(t, db, "cf-a-4", nodeA) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, 
nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBalanceScheduler("test", 10, oc, db, 0, drainController) s.drainBalanceBlockedUntil = time.Time{} @@ -171,7 +171,7 @@ func TestBalanceSchedulerUsesBalanceIntervalAsDrainBlockWindow(t *testing.T) { addReplicatingMaintainer(t, db, "cf-a-2", nodeA) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) interval := 200 * time.Millisecond s := NewBalanceScheduler("test", 10, oc, db, interval, drainController) @@ -198,7 +198,7 @@ func TestBalanceSchedulerSkipsWhenSchedulingFrozen(t *testing.T) { addReplicatingMaintainer(t, db, "cf-a-2", nodeA) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBalanceScheduler("test", 10, oc, db, 0, drainController) _ = s.Execute() diff --git a/coordinator/scheduler/basic_test.go b/coordinator/scheduler/basic_test.go index 9df52468c6..064cde6b48 100644 --- a/coordinator/scheduler/basic_test.go +++ b/coordinator/scheduler/basic_test.go @@ -59,7 +59,7 @@ func TestBasicSchedulerRequiresTargetAckBeforeUsingDestination(t *testing.T) { cfID := addAbsentChangefeed(t, db, "cf-absent") selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBasicScheduler("test", 10, oc, db, drainController) _ = s.Execute() @@ -85,7 +85,7 @@ func TestBasicSchedulerSkipsWhenSchedulingFrozen(t *testing.T) { cfID := addAbsentChangefeed(t, db, "cf-frozen") selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewBasicScheduler("test", 10, oc, db, drainController) _ = 
s.Execute() diff --git a/coordinator/scheduler/drain_test.go b/coordinator/scheduler/drain_test.go index 8286cd2c70..bd91240c61 100644 --- a/coordinator/scheduler/drain_test.go +++ b/coordinator/scheduler/drain_test.go @@ -73,7 +73,7 @@ func TestDrainSchedulerCreatesMoveOperators(t *testing.T) { db.AddReplicatingMaintainer(changefeed.NewChangefeed(loadInfo2.ChangefeedID, &loadInfo2, 1, false), destHot) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewDrainScheduler("test", 10, oc, db, drainController) _ = s.Execute() @@ -123,7 +123,7 @@ func TestDrainSchedulerSkipsChangefeedWithInflightOperator(t *testing.T) { db.AddReplicatingMaintainer(cf2, origin) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) require.True(t, oc.AddOperator(operator.NewMoveMaintainerOperator(db, cf1, origin, dest))) require.Equal(t, 1, oc.OperatorSize()) @@ -161,7 +161,7 @@ func TestDrainSchedulerIgnoresUnrelatedOperatorCapacity(t *testing.T) { otherCF := db.GetByID(otherID) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) require.True(t, oc.AddOperator(operator.NewMoveMaintainerOperator(db, otherCF, other, dest))) s := NewDrainScheduler("test", 1, oc, db, drainController) @@ -200,7 +200,7 @@ func TestDrainSchedulerRotatesAcrossDrainingNodes(t *testing.T) { cfB := addReplicatingMaintainer(t, db, "cf-b", originB) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewDrainScheduler("test", 1, oc, db, drainController) _ = s.Execute() @@ -252,7 +252,7 @@ func 
TestDrainSchedulerRequiresTargetAckBeforeUsingDestination(t *testing.T) { cfID := addReplicatingMaintainer(t, db, "cf-drain", origin) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewDrainScheduler("test", 10, oc, db, drainController) _ = s.Execute() @@ -283,7 +283,7 @@ func TestDrainSchedulerSkipsWhenSchedulingFrozen(t *testing.T) { cfID := addReplicatingMaintainer(t, db, "cf-frozen-drain", origin) selfNode := &node.Info{ID: node.ID("coordinator")} - oc := operator.NewOperatorController(selfNode, db, nil, 10) + oc := operator.NewOperatorController(selfNode, db, nil, nil, 10) s := NewDrainScheduler("test", 10, oc, db, drainController) _ = s.Execute() diff --git a/heartbeatpb/heartbeat.pb.go b/heartbeatpb/heartbeat.pb.go index 686aa2ab6e..6b2c19f3cd 100644 --- a/heartbeatpb/heartbeat.pb.go +++ b/heartbeatpb/heartbeat.pb.go @@ -841,6 +841,8 @@ type HeartBeatResponse struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` DispatcherStatuses []*DispatcherStatus `protobuf:"bytes,2,rep,name=dispatcherStatuses,proto3" json:"dispatcherStatuses,omitempty"` Mode int64 `protobuf:"varint,3,opt,name=mode,proto3" json:"mode,omitempty"` + // maintainer_epoch fences barrier decisions from stale maintainers. 
+ MaintainerEpoch uint64 `protobuf:"varint,4,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *HeartBeatResponse) Reset() { *m = HeartBeatResponse{} } @@ -897,6 +899,13 @@ func (m *HeartBeatResponse) GetMode() int64 { return 0 } +func (m *HeartBeatResponse) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type CheckpointTsMessage struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` CheckpointTs uint64 `protobuf:"varint,2,opt,name=checkpointTs,proto3" json:"checkpointTs,omitempty"` @@ -1208,10 +1217,11 @@ func (m *DispatcherConfig) GetSkipDMLAsStartTs() bool { } type ScheduleDispatcherRequest struct { - ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` - Config *DispatcherConfig `protobuf:"bytes,2,opt,name=config,proto3" json:"config,omitempty"` - ScheduleAction ScheduleAction `protobuf:"varint,3,opt,name=scheduleAction,proto3,enum=heartbeatpb.ScheduleAction" json:"scheduleAction,omitempty"` - OperatorType OperatorType `protobuf:"varint,4,opt,name=operatorType,proto3,enum=heartbeatpb.OperatorType" json:"operatorType,omitempty"` + ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` + Config *DispatcherConfig `protobuf:"bytes,2,opt,name=config,proto3" json:"config,omitempty"` + ScheduleAction ScheduleAction `protobuf:"varint,3,opt,name=scheduleAction,proto3,enum=heartbeatpb.ScheduleAction" json:"scheduleAction,omitempty"` + OperatorType OperatorType `protobuf:"varint,4,opt,name=operatorType,proto3,enum=heartbeatpb.OperatorType" json:"operatorType,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,5,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *ScheduleDispatcherRequest) Reset() { *m = ScheduleDispatcherRequest{} } @@ -1275,11 +1285,19 @@ func (m 
*ScheduleDispatcherRequest) GetOperatorType() OperatorType { return OperatorType_O_Add } +func (m *ScheduleDispatcherRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MergeDispatcherRequest struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` DispatcherIDs []*DispatcherID `protobuf:"bytes,2,rep,name=dispatcherIDs,proto3" json:"dispatcherIDs,omitempty"` MergedDispatcherID *DispatcherID `protobuf:"bytes,3,opt,name=mergedDispatcherID,proto3" json:"mergedDispatcherID,omitempty"` Mode int64 `protobuf:"varint,4,opt,name=mode,proto3" json:"mode,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,5,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MergeDispatcherRequest) Reset() { *m = MergeDispatcherRequest{} } @@ -1343,6 +1361,13 @@ func (m *MergeDispatcherRequest) GetMode() int64 { return 0 } +func (m *MergeDispatcherRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerHeartbeat struct { Statuses []*MaintainerStatus `protobuf:"bytes,1,rep,name=statuses,proto3" json:"statuses,omitempty"` } @@ -1474,7 +1499,8 @@ type MaintainerStatus struct { LastSyncedTs uint64 `protobuf:"varint,7,opt,name=lastSyncedTs,proto3" json:"lastSyncedTs,omitempty"` // drain_progress reports the active dispatcher drain target observed by this maintainer. // Nil means no active dispatcher drain target. 
- DrainProgress *DrainProgress `protobuf:"bytes,8,opt,name=drain_progress,json=drainProgress,proto3" json:"drain_progress,omitempty"` + DrainProgress *DrainProgress `protobuf:"bytes,8,opt,name=drain_progress,json=drainProgress,proto3" json:"drain_progress,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,9,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerStatus) Reset() { *m = MaintainerStatus{} } @@ -1566,6 +1592,13 @@ func (m *MaintainerStatus) GetDrainProgress() *DrainProgress { return nil } +func (m *MaintainerStatus) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + // NodeHeartbeat is sent periodically from a node to the coordinator. type NodeHeartbeat struct { Liveness NodeLiveness `protobuf:"varint,1,opt,name=liveness,proto3,enum=heartbeatpb.NodeLiveness" json:"liveness,omitempty"` @@ -1923,6 +1956,7 @@ type AddMaintainerRequest struct { CheckpointTs uint64 `protobuf:"varint,3,opt,name=checkpoint_ts,json=checkpointTs,proto3" json:"checkpoint_ts,omitempty"` IsNewChangefeed bool `protobuf:"varint,4,opt,name=is_new_changefeed,json=isNewChangefeed,proto3" json:"is_new_changefeed,omitempty"` KeyspaceId uint32 `protobuf:"varint,5,opt,name=keyspace_id,json=keyspaceId,proto3" json:"keyspace_id,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,6,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *AddMaintainerRequest) Reset() { *m = AddMaintainerRequest{} } @@ -1993,11 +2027,19 @@ func (m *AddMaintainerRequest) GetKeyspaceId() uint32 { return 0 } +func (m *AddMaintainerRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type RemoveMaintainerRequest struct { - Id *ChangefeedID `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Cascade bool `protobuf:"varint,2,opt,name=cascade,proto3" json:"cascade,omitempty"` - Removed bool 
`protobuf:"varint,3,opt,name=removed,proto3" json:"removed,omitempty"` - KeyspaceId uint32 `protobuf:"varint,4,opt,name=keyspace_id,json=keyspaceId,proto3" json:"keyspace_id,omitempty"` + Id *ChangefeedID `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Cascade bool `protobuf:"varint,2,opt,name=cascade,proto3" json:"cascade,omitempty"` + Removed bool `protobuf:"varint,3,opt,name=removed,proto3" json:"removed,omitempty"` + KeyspaceId uint32 `protobuf:"varint,4,opt,name=keyspace_id,json=keyspaceId,proto3" json:"keyspace_id,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,5,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *RemoveMaintainerRequest) Reset() { *m = RemoveMaintainerRequest{} } @@ -2061,6 +2103,13 @@ func (m *RemoveMaintainerRequest) GetKeyspaceId() uint32 { return 0 } +func (m *RemoveMaintainerRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerBootstrapRequest struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` Config []byte `protobuf:"bytes,2,opt,name=config,proto3" json:"config,omitempty"` @@ -2069,6 +2118,7 @@ type MaintainerBootstrapRequest struct { IsNewChangefeed bool `protobuf:"varint,5,opt,name=is_new_changefeed,json=isNewChangefeed,proto3" json:"is_new_changefeed,omitempty"` TableTriggerRedoDispatcherId *DispatcherID `protobuf:"bytes,6,opt,name=table_trigger_redo_dispatcher_id,json=tableTriggerRedoDispatcherId,proto3" json:"table_trigger_redo_dispatcher_id,omitempty"` KeyspaceId uint32 `protobuf:"varint,7,opt,name=keyspace_id,json=keyspaceId,proto3" json:"keyspace_id,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,8,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerBootstrapRequest) Reset() { *m = MaintainerBootstrapRequest{} } @@ -2153,6 +2203,13 @@ func (m 
*MaintainerBootstrapRequest) GetKeyspaceId() uint32 { return 0 } +func (m *MaintainerBootstrapRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerBootstrapResponse struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` Spans []*BootstrapTableSpan `protobuf:"bytes,2,rep,name=spans,proto3" json:"spans,omitempty"` @@ -2170,6 +2227,7 @@ type MaintainerBootstrapResponse struct { // It will be used when redo enable. RedoCheckpointTs uint64 `protobuf:"varint,5,opt,name=redo_checkpoint_ts,json=redoCheckpointTs,proto3" json:"redo_checkpoint_ts,omitempty"` Operators []*ScheduleDispatcherRequest `protobuf:"bytes,6,rep,name=operators,proto3" json:"operators,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,7,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerBootstrapResponse) Reset() { *m = MaintainerBootstrapResponse{} } @@ -2247,11 +2305,19 @@ func (m *MaintainerBootstrapResponse) GetOperators() []*ScheduleDispatcherReques return nil } +func (m *MaintainerBootstrapResponse) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerPostBootstrapRequest struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` TableTriggerEventDispatcherId *DispatcherID `protobuf:"bytes,2,opt,name=table_trigger_event_dispatcher_id,json=tableTriggerEventDispatcherId,proto3" json:"table_trigger_event_dispatcher_id,omitempty"` Schemas []*SchemaInfo `protobuf:"bytes,3,rep,name=schemas,proto3" json:"schemas,omitempty"` RedoSchemas []*SchemaInfo `protobuf:"bytes,4,rep,name=redo_schemas,json=redoSchemas,proto3" json:"redo_schemas,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,5,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m 
*MaintainerPostBootstrapRequest) Reset() { *m = MaintainerPostBootstrapRequest{} } @@ -2315,10 +2381,18 @@ func (m *MaintainerPostBootstrapRequest) GetRedoSchemas() []*SchemaInfo { return nil } +func (m *MaintainerPostBootstrapRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerPostBootstrapResponse struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` TableTriggerEventDispatcherId *DispatcherID `protobuf:"bytes,2,opt,name=table_trigger_event_dispatcher_id,json=tableTriggerEventDispatcherId,proto3" json:"table_trigger_event_dispatcher_id,omitempty"` Err *RunningError `protobuf:"bytes,3,opt,name=err,proto3" json:"err,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,4,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerPostBootstrapResponse) Reset() { *m = MaintainerPostBootstrapResponse{} } @@ -2375,6 +2449,13 @@ func (m *MaintainerPostBootstrapResponse) GetErr() *RunningError { return nil } +func (m *MaintainerPostBootstrapResponse) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type SchemaInfo struct { SchemaID int64 `protobuf:"varint,1,opt,name=SchemaID,proto3" json:"SchemaID,omitempty"` SchemaName string `protobuf:"bytes,2,opt,name=SchemaName,proto3" json:"SchemaName,omitempty"` @@ -2582,7 +2663,8 @@ func (m *BootstrapTableSpan) GetMode() int64 { type MaintainerCloseRequest struct { ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` // true when remove changefeed, false when pause the changefeed. 
- Removed bool `protobuf:"varint,2,opt,name=removed,proto3" json:"removed,omitempty"` + Removed bool `protobuf:"varint,2,opt,name=removed,proto3" json:"removed,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,3,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerCloseRequest) Reset() { *m = MaintainerCloseRequest{} } @@ -2632,9 +2714,17 @@ func (m *MaintainerCloseRequest) GetRemoved() bool { return false } +func (m *MaintainerCloseRequest) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type MaintainerCloseResponse struct { - ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` - Success bool `protobuf:"varint,2,opt,name=success,proto3" json:"success,omitempty"` + ChangefeedID *ChangefeedID `protobuf:"bytes,1,opt,name=changefeedID,proto3" json:"changefeedID,omitempty"` + Success bool `protobuf:"varint,2,opt,name=success,proto3" json:"success,omitempty"` + MaintainerEpoch uint64 `protobuf:"varint,3,opt,name=maintainer_epoch,json=maintainerEpoch,proto3" json:"maintainer_epoch,omitempty"` } func (m *MaintainerCloseResponse) Reset() { *m = MaintainerCloseResponse{} } @@ -2684,6 +2774,13 @@ func (m *MaintainerCloseResponse) GetSuccess() bool { return false } +func (m *MaintainerCloseResponse) GetMaintainerEpoch() uint64 { + if m != nil { + return m.MaintainerEpoch + } + return 0 +} + type InfluencedTables struct { InfluenceType InfluenceType `protobuf:"varint,1,opt,name=InfluenceType,proto3,enum=heartbeatpb.InfluenceType" json:"InfluenceType,omitempty"` // only exist when type is normal @@ -3882,190 +3979,194 @@ func init() { func init() { proto.RegisterFile("heartbeatpb/heartbeat.proto", fileDescriptor_6d584080fdadb670) } var fileDescriptor_6d584080fdadb670 = []byte{ - // 2926 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xcc, 0x3a, 0x4d, 0x6f, 0x1c, 
0xc7, - 0xb1, 0x9a, 0xd9, 0x0f, 0x72, 0x6b, 0xb9, 0xd4, 0xa8, 0x25, 0x51, 0x94, 0x44, 0x51, 0xf4, 0x3c, - 0xbf, 0x07, 0xbe, 0xb5, 0x9f, 0xf4, 0x24, 0x5b, 0xf9, 0x70, 0x1c, 0x3b, 0xab, 0x5d, 0xda, 0x5a, - 0x88, 0x4b, 0x12, 0xbd, 0xb4, 0x15, 0x38, 0x87, 0xcd, 0x70, 0xa6, 0xb5, 0x1c, 0x73, 0x77, 0x7a, - 0x35, 0x33, 0x2b, 0x4a, 0x02, 0x9c, 0xc0, 0x30, 0x72, 0xcb, 0x21, 0x01, 0x12, 0x20, 0x39, 0xe4, - 0x94, 0x3f, 0x10, 0x20, 0xc8, 0x31, 0xf7, 0x00, 0xb9, 0xf8, 0x14, 0x38, 0xa7, 0x04, 0xf6, 0x3d, - 0x08, 0x92, 0x83, 0xaf, 0x41, 0x7f, 0xcc, 0x4c, 0xcf, 0x07, 0x49, 0x29, 0x5c, 0x18, 0x39, 0xed, - 0x54, 0x75, 0x55, 0x75, 0x75, 0x75, 0x75, 0x55, 0x75, 0xf5, 0xc2, 0xd5, 0x7d, 0x62, 0xf9, 0xe1, - 0x1e, 0xb1, 0xc2, 0xc9, 0xde, 0xcd, 0xf8, 0xfb, 0xc6, 0xc4, 0xa7, 0x21, 0x45, 0x75, 0x65, 0xd0, - 0x7c, 0x0a, 0xb5, 0x5d, 0x6b, 0x6f, 0x44, 0xfa, 0x13, 0xcb, 0x43, 0xcb, 0x30, 0xc7, 0x81, 0x6e, - 0x67, 0x59, 0x5b, 0xd3, 0xd6, 0x4b, 0x38, 0x02, 0xd1, 0x15, 0x98, 0xef, 0x87, 0x96, 0x1f, 0xde, - 0x27, 0x4f, 0x97, 0xf5, 0x35, 0x6d, 0x7d, 0x01, 0xc7, 0x30, 0x5a, 0x82, 0xea, 0x86, 0xe7, 0xb0, - 0x91, 0x12, 0x1f, 0x91, 0x10, 0x5a, 0x05, 0xb8, 0x4f, 0x9e, 0x06, 0x13, 0xcb, 0x66, 0x02, 0xcb, - 0x6b, 0xda, 0x7a, 0x03, 0x2b, 0x18, 0xf3, 0x4f, 0x3a, 0x18, 0xf7, 0x98, 0x2a, 0x77, 0x89, 0x15, - 0x62, 0xf2, 0x68, 0x4a, 0x82, 0x10, 0x7d, 0x1b, 0x16, 0xec, 0x7d, 0xcb, 0x1b, 0x92, 0x87, 0x84, - 0x38, 0x52, 0x8f, 0xfa, 0xed, 0xcb, 0x37, 0x14, 0x9d, 0x6f, 0xb4, 0x15, 0x02, 0x9c, 0x22, 0x47, - 0xaf, 0x43, 0xed, 0xd0, 0x0a, 0x89, 0x3f, 0xb6, 0xfc, 0x03, 0xae, 0x68, 0xfd, 0xf6, 0x52, 0x8a, - 0xf7, 0x41, 0x34, 0x8a, 0x13, 0x42, 0xf4, 0x26, 0x34, 0x7c, 0xe2, 0xd0, 0x78, 0x8c, 0x2f, 0xe4, - 0x68, 0xce, 0x34, 0x31, 0xfa, 0x06, 0xcc, 0x07, 0xa1, 0x15, 0x4e, 0x03, 0x12, 0x2c, 0x97, 0xd7, - 0x4a, 0xeb, 0xf5, 0xdb, 0x2b, 0x29, 0xc6, 0xd8, 0xbe, 0x7d, 0x4e, 0x85, 0x63, 0x6a, 0xb4, 0x0e, - 0x67, 0x6d, 0x3a, 0x9e, 0x90, 0x11, 0x09, 0x89, 0x18, 0x5c, 0xae, 0xac, 0x69, 0xeb, 0xf3, 0x38, - 0x8b, 0x46, 0xaf, 0x40, 0x89, 
0xf8, 0xfe, 0x72, 0xb5, 0xc0, 0x1a, 0x78, 0xea, 0x79, 0xae, 0x37, - 0xdc, 0xf0, 0x7d, 0xea, 0x63, 0x46, 0x65, 0xfe, 0x48, 0x83, 0x5a, 0xa2, 0x9e, 0xc9, 0x2c, 0x4a, - 0xec, 0x83, 0x09, 0x75, 0xbd, 0x70, 0x37, 0xe0, 0x16, 0x2d, 0xe3, 0x14, 0x8e, 0x6d, 0x95, 0x4f, - 0x02, 0x3a, 0x7a, 0x4c, 0x9c, 0xdd, 0x80, 0xdb, 0xad, 0x8c, 0x15, 0x0c, 0x32, 0xa0, 0x14, 0x90, - 0x47, 0xdc, 0x2c, 0x65, 0xcc, 0x3e, 0x99, 0xd4, 0x91, 0x15, 0x84, 0xfd, 0xa7, 0x9e, 0xcd, 0x79, - 0xca, 0x42, 0xaa, 0x8a, 0x33, 0x3f, 0x02, 0xa3, 0xe3, 0x06, 0x13, 0x2b, 0xb4, 0xf7, 0x89, 0xdf, - 0xb2, 0x43, 0x97, 0x7a, 0xe8, 0x15, 0xa8, 0x5a, 0xfc, 0x8b, 0xeb, 0xb1, 0x78, 0xfb, 0x7c, 0x6a, - 0x2d, 0x82, 0x08, 0x4b, 0x12, 0xe6, 0x75, 0x6d, 0x3a, 0x1e, 0xbb, 0x61, 0xac, 0x54, 0x0c, 0xa3, - 0x35, 0xa8, 0x77, 0x03, 0x36, 0xd5, 0x0e, 0x5b, 0x03, 0x57, 0x6d, 0x1e, 0xab, 0x28, 0xb3, 0x0d, - 0xa5, 0x56, 0xfb, 0x7e, 0x4a, 0x88, 0x76, 0xbc, 0x10, 0x3d, 0x2f, 0x04, 0x03, 0xea, 0x0e, 0x3d, - 0xea, 0x13, 0xe7, 0xee, 0x88, 0xda, 0x07, 0x72, 0x3b, 0x4e, 0x27, 0xf3, 0x13, 0x1d, 0x2e, 0x76, - 0xbd, 0x87, 0xa3, 0x29, 0x61, 0x86, 0x4a, 0x4c, 0x14, 0xa0, 0xef, 0x40, 0x23, 0x1e, 0xd8, 0x7d, - 0x3a, 0x21, 0xd2, 0x48, 0x57, 0x52, 0x46, 0x4a, 0x51, 0xe0, 0x34, 0x03, 0x7a, 0x1b, 0x1a, 0x89, - 0xc0, 0x6e, 0x87, 0xd9, 0xad, 0x94, 0x73, 0x19, 0x95, 0x02, 0xa7, 0xe9, 0xf9, 0x49, 0xb7, 0xf7, - 0xc9, 0xd8, 0xea, 0x76, 0xb8, 0x51, 0x4b, 0x38, 0x86, 0xd1, 0x7d, 0x38, 0x4f, 0x9e, 0xd8, 0xa3, - 0xa9, 0x43, 0x14, 0x1e, 0x87, 0xef, 0xfd, 0xb1, 0x53, 0x14, 0x71, 0x99, 0xbf, 0xd0, 0x55, 0xf7, - 0x90, 0x86, 0xfd, 0x2e, 0x5c, 0x74, 0x8b, 0x2c, 0x23, 0xe3, 0x80, 0x59, 0x6c, 0x08, 0x95, 0x12, - 0x17, 0x0b, 0x40, 0x77, 0x62, 0xc7, 0x13, 0x61, 0xe1, 0xda, 0x11, 0xea, 0x66, 0x5c, 0xd0, 0x84, - 0x92, 0x65, 0x47, 0x01, 0xc1, 0x48, 0x3b, 0x6b, 0xfb, 0x3e, 0x66, 0x83, 0x68, 0x1b, 0x90, 0x9b, - 0xf3, 0x11, 0x69, 0x95, 0xeb, 0x69, 0x8d, 0x73, 0x64, 0xb8, 0x80, 0xd5, 0xfc, 0x9d, 0x06, 0xe7, - 0x94, 0xc8, 0x18, 0x4c, 0xa8, 0x17, 0x90, 0xd3, 0x86, 0xc6, 0x1e, 
0x20, 0x27, 0x63, 0x6e, 0x12, - 0xb9, 0xc7, 0x51, 0xc6, 0x88, 0x74, 0xcc, 0x33, 0x22, 0x04, 0xe5, 0x31, 0x75, 0x88, 0xf4, 0x11, - 0xfe, 0x6d, 0x3e, 0x81, 0xf3, 0x6d, 0x25, 0xac, 0xf4, 0x48, 0x10, 0x58, 0xc3, 0x53, 0x2b, 0x9e, - 0x0d, 0x60, 0x7a, 0x3e, 0x80, 0x99, 0x3f, 0xd3, 0xe0, 0x2c, 0x26, 0x0e, 0xed, 0x91, 0xd0, 0x9a, - 0xd1, 0xb4, 0x27, 0xc5, 0xc4, 0xac, 0x5a, 0xa5, 0x02, 0xb5, 0x7e, 0x00, 0xd7, 0x98, 0x56, 0x38, - 0xe6, 0xda, 0xf1, 0xe9, 0xd0, 0x27, 0x41, 0xf0, 0xd5, 0xe8, 0x68, 0x7e, 0x04, 0x2b, 0xe9, 0xf9, - 0xdf, 0xa1, 0xfe, 0xa1, 0xe5, 0x3b, 0x5f, 0xd1, 0xf4, 0xff, 0xd4, 0xd4, 0x23, 0xde, 0xa6, 0xde, - 0x43, 0x77, 0x88, 0x9a, 0x50, 0x0e, 0x26, 0x96, 0x27, 0xe7, 0x5a, 0x2a, 0x4e, 0x95, 0x98, 0xd3, - 0xb0, 0x82, 0x24, 0x60, 0x65, 0x46, 0x2c, 0x3d, 0x02, 0x99, 0xe6, 0x8e, 0x12, 0x62, 0xe4, 0x01, - 0x3d, 0x26, 0x06, 0xa5, 0xc8, 0x59, 0x94, 0x0b, 0xa2, 0x28, 0x57, 0x16, 0x51, 0x2e, 0x82, 0x63, - 0xcf, 0xae, 0x24, 0x9e, 0x8d, 0x9a, 0x60, 0x04, 0x07, 0xee, 0xa4, 0xd3, 0xdb, 0x6c, 0x05, 0x7d, - 0xa9, 0x51, 0x95, 0x47, 0xf6, 0x1c, 0xde, 0xfc, 0xb9, 0x0e, 0x97, 0x59, 0xc8, 0x74, 0xa6, 0x23, - 0x25, 0xe2, 0xcd, 0xa8, 0xc0, 0xb9, 0x03, 0x55, 0x9b, 0xdb, 0xf1, 0x84, 0x30, 0x26, 0x8c, 0x8d, - 0x25, 0x31, 0x6a, 0xc3, 0x62, 0x20, 0x55, 0x12, 0x01, 0x8e, 0x1b, 0x6c, 0xf1, 0xf6, 0xd5, 0x14, - 0x7b, 0x3f, 0x45, 0x82, 0x33, 0x2c, 0x4c, 0x75, 0x3a, 0x21, 0xbe, 0x15, 0x52, 0x9f, 0x27, 0xa7, - 0x32, 0x17, 0x91, 0x56, 0x7d, 0x5b, 0x21, 0xc0, 0x29, 0x72, 0xf3, 0x4b, 0x0d, 0x96, 0x7a, 0xc4, - 0x1f, 0xce, 0xde, 0x28, 0x6f, 0x43, 0xc3, 0x79, 0xc1, 0xa4, 0x97, 0xa2, 0x47, 0x5d, 0x40, 0x63, - 0xa6, 0x99, 0xd3, 0x79, 0x21, 0x9f, 0x2a, 0x60, 0x8a, 0xbd, 0xa7, 0xac, 0xc4, 0xc5, 0x1d, 0x38, - 0xdf, 0xb3, 0x5c, 0x2f, 0xb4, 0x5c, 0x8f, 0xf8, 0xf7, 0x22, 0x69, 0xe8, 0x9b, 0x4a, 0xe1, 0xa8, - 0x15, 0xc4, 0xe1, 0x84, 0x27, 0x5b, 0x39, 0x9a, 0x9f, 0x69, 0xd0, 0xe8, 0xf8, 0x96, 0xeb, 0x45, - 0x01, 0x05, 0xbd, 0x0c, 0x8b, 0xa1, 0xe5, 0x0f, 0x49, 0x38, 0xf0, 0xa8, 0x43, 0x06, 0xae, 0xc3, - 0x8d, 
0x58, 0xc3, 0x0b, 0x02, 0xbb, 0x45, 0x1d, 0xd2, 0x75, 0xd0, 0x4b, 0x20, 0xe1, 0x01, 0x99, - 0x50, 0x7b, 0x5f, 0x9e, 0xaa, 0xba, 0xc0, 0x6d, 0x30, 0x14, 0xfa, 0x1a, 0x5c, 0x92, 0x24, 0x89, - 0x8d, 0x06, 0x36, 0x9d, 0xca, 0x22, 0xab, 0x81, 0x2f, 0x8a, 0x61, 0xd5, 0xd7, 0xa6, 0x5e, 0x88, - 0xde, 0x81, 0x35, 0xc9, 0xc7, 0x12, 0xb0, 0x3b, 0xdc, 0x0f, 0x07, 0x0e, 0xd3, 0x70, 0x30, 0xa6, - 0x8f, 0x89, 0x14, 0x20, 0x2e, 0x01, 0x2b, 0x82, 0xae, 0x2b, 0xc9, 0xf8, 0x3a, 0x7a, 0xf4, 0x31, - 0xe1, 0x72, 0xcc, 0x4f, 0x4a, 0x60, 0x64, 0x57, 0x7e, 0x5a, 0x07, 0xb9, 0x06, 0xc0, 0xbe, 0x06, - 0xcc, 0x7e, 0x84, 0x2f, 0xba, 0x86, 0x6b, 0x0c, 0xc3, 0xc4, 0x13, 0x74, 0x0b, 0x2a, 0x62, 0xa4, - 0xe8, 0x50, 0xb4, 0xe9, 0x78, 0x42, 0x3d, 0xe2, 0x85, 0x9c, 0x16, 0x0b, 0x4a, 0xf4, 0x5f, 0xd0, - 0x48, 0x22, 0xfd, 0x20, 0x8c, 0x0b, 0xe0, 0x54, 0x59, 0x2d, 0xab, 0xf6, 0x4a, 0x81, 0x37, 0xe6, - 0xaa, 0x76, 0xf4, 0xdf, 0xb0, 0xb8, 0x47, 0x69, 0x18, 0x84, 0xbe, 0x35, 0x19, 0x38, 0xd4, 0x23, - 0x32, 0xc0, 0x34, 0x62, 0x6c, 0x87, 0x7a, 0x24, 0x57, 0x78, 0xcf, 0xe5, 0x0b, 0x6f, 0xd4, 0x82, - 0x45, 0x61, 0xfa, 0x89, 0xf4, 0x8e, 0xe5, 0x79, 0x6e, 0xaf, 0x74, 0x1d, 0x99, 0xf2, 0x1f, 0xdc, - 0x70, 0x54, 0xd0, 0xfc, 0x9b, 0x06, 0x0d, 0xe6, 0x33, 0x89, 0xb7, 0xde, 0x81, 0xf9, 0x91, 0xfb, - 0x98, 0x78, 0x4c, 0x9c, 0x56, 0x70, 0xf2, 0x19, 0xf5, 0xa6, 0x24, 0xc0, 0x31, 0x29, 0x33, 0x3d, - 0x77, 0x48, 0xd5, 0xdf, 0x6a, 0x0c, 0x23, 0xbc, 0xad, 0x03, 0xd7, 0x15, 0x37, 0x13, 0x5a, 0x67, - 0xfc, 0xb8, 0xc4, 0xb7, 0xeb, 0x6a, 0x42, 0xc6, 0x15, 0xdf, 0x55, 0xdd, 0xba, 0x05, 0xd7, 0x8e, - 0x92, 0x22, 0xe6, 0x15, 0xbb, 0x73, 0xa5, 0x50, 0x06, 0x57, 0xc4, 0xfc, 0x10, 0x96, 0xfa, 0x42, - 0x5e, 0xbc, 0x08, 0x19, 0x9c, 0x6e, 0x41, 0x55, 0xc8, 0x3a, 0x79, 0xd9, 0x92, 0xf0, 0x84, 0x45, - 0x9b, 0x63, 0xb8, 0x94, 0x9b, 0x4b, 0x16, 0x79, 0xaf, 0xc1, 0x9c, 0x35, 0x99, 0x8c, 0x5c, 0xe2, - 0x9c, 0x3c, 0x5b, 0x44, 0x79, 0xd2, 0x74, 0x1f, 0xc2, 0xf5, 0xbe, 0x7a, 0x5e, 0x95, 0xb5, 0x47, - 0x6b, 0x9c, 0x55, 0xf4, 0x30, 0xbf, 0x0e, 
0x57, 0xdb, 0x94, 0xfa, 0x8e, 0xeb, 0xb1, 0xb8, 0x7f, - 0x37, 0x72, 0xdd, 0x68, 0x9e, 0x65, 0x98, 0x7b, 0x4c, 0xfc, 0x20, 0xba, 0xff, 0x95, 0x70, 0x04, - 0xb2, 0xeb, 0xc0, 0x4a, 0x31, 0xa7, 0xb4, 0xcc, 0xbf, 0x1f, 0x2d, 0xd1, 0xeb, 0xb0, 0x14, 0x9f, - 0x87, 0x90, 0xda, 0x74, 0x34, 0x88, 0x94, 0xd0, 0x79, 0x40, 0xba, 0x10, 0xf9, 0x3e, 0x1f, 0x7c, - 0x5f, 0x8c, 0xfd, 0xe7, 0xb8, 0xe6, 0x1f, 0x35, 0xb8, 0xd0, 0x72, 0x9c, 0x64, 0x81, 0x91, 0x35, - 0xff, 0x17, 0x74, 0xb9, 0x53, 0xc7, 0xc6, 0x42, 0xdd, 0x75, 0xd0, 0x52, 0xaa, 0x6e, 0x58, 0x88, - 0x0b, 0x83, 0x5c, 0x1c, 0x2b, 0x28, 0x63, 0x51, 0x13, 0xce, 0xb9, 0xc1, 0xc0, 0x23, 0x87, 0x83, - 0x24, 0xaa, 0x72, 0xbd, 0xe7, 0xf1, 0x59, 0x37, 0xd8, 0x22, 0x87, 0xc9, 0x74, 0xe8, 0x3a, 0xd4, - 0x0f, 0x64, 0x8f, 0x87, 0x59, 0xa8, 0x22, 0xda, 0x3e, 0x11, 0xaa, 0xeb, 0x98, 0xbf, 0xd4, 0xe0, - 0x12, 0x26, 0x2c, 0x29, 0x9c, 0x6a, 0x41, 0xcb, 0x30, 0x67, 0x5b, 0x81, 0x6d, 0x39, 0x44, 0x5e, - 0xb1, 0x23, 0x90, 0x8d, 0xf8, 0x5c, 0xbe, 0x23, 0xbb, 0x02, 0x11, 0x98, 0xd5, 0xad, 0x9c, 0xd3, - 0xed, 0xd7, 0x25, 0xb8, 0x92, 0x68, 0x95, 0xf3, 0xde, 0x53, 0x66, 0xa1, 0xa3, 0xf6, 0xe0, 0x32, - 0xf7, 0x6c, 0x5f, 0x31, 0x7f, 0x5c, 0xe6, 0xda, 0xf0, 0x52, 0xc8, 0x6a, 0xe2, 0x41, 0xe8, 0xbb, - 0xc3, 0x21, 0xf1, 0x07, 0xe4, 0x31, 0xf1, 0x52, 0x99, 0xd9, 0x7d, 0x8e, 0xfb, 0xf7, 0x35, 0x2e, - 0x63, 0x57, 0x88, 0xd8, 0x60, 0x12, 0xd4, 0x9b, 0x78, 0xf1, 0xf6, 0x56, 0x8a, 0xb7, 0xd7, 0x62, - 0x59, 0x5e, 0x55, 0xc8, 0x27, 0x0e, 0xcd, 0xe8, 0x53, 0x3d, 0x49, 0x9f, 0x15, 0x55, 0x1f, 0x76, - 0x41, 0x49, 0xa9, 0x93, 0xd9, 0xa5, 0xb9, 0xdc, 0x2e, 0xfd, 0x59, 0x87, 0xab, 0x85, 0xbb, 0x34, - 0x9b, 0x8b, 0xf2, 0x1d, 0xa8, 0xb0, 0xcb, 0x47, 0x54, 0x45, 0xa6, 0x6f, 0xf0, 0xf1, 0x6c, 0xc9, - 0x55, 0x45, 0x50, 0x47, 0xc9, 0xbe, 0xf4, 0x3c, 0x2d, 0xba, 0xe7, 0x2b, 0x1f, 0x5e, 0x05, 0xc4, - 0xad, 0x9b, 0xa6, 0xac, 0x70, 0x4a, 0x83, 0x8d, 0xa8, 0x97, 0x6d, 0xd4, 0x81, 0x5a, 0x54, 0x6e, - 0xb3, 0xbb, 0x09, 0x53, 0xfd, 0x7f, 0x0a, 0xab, 0xfb, 0x5c, 0xf9, 0x8d, 0x13, 
0x46, 0xf3, 0xb7, - 0x3a, 0xac, 0x26, 0xb6, 0xdd, 0xa1, 0x41, 0x38, 0xeb, 0x53, 0xf0, 0x5c, 0x2e, 0xad, 0x9f, 0xd2, - 0xa5, 0x6f, 0xc1, 0x9c, 0xb8, 0xcf, 0xb1, 0x13, 0xc5, 0x4c, 0x71, 0x29, 0x67, 0x8a, 0xb1, 0xd5, - 0xf5, 0x1e, 0x52, 0x1c, 0xd1, 0xa1, 0x37, 0x60, 0x81, 0x5b, 0x3b, 0xe2, 0x2b, 0x1f, 0xcf, 0x57, - 0x67, 0xc4, 0x02, 0x0e, 0xcc, 0x7f, 0x68, 0x70, 0xfd, 0x48, 0xab, 0xcd, 0xc6, 0x2b, 0xbf, 0x12, - 0xb3, 0xbd, 0x88, 0x0f, 0x9b, 0x4f, 0x00, 0x12, 0x7b, 0xa4, 0xfa, 0x86, 0x5a, 0xa6, 0x6f, 0xb8, - 0x1a, 0x51, 0x6e, 0x59, 0xe3, 0xa8, 0xfc, 0x56, 0x30, 0xe8, 0x06, 0x54, 0xf9, 0x71, 0x8a, 0x36, - 0xab, 0xa0, 0x29, 0xc0, 0x6d, 0x2e, 0xa9, 0xcc, 0xb6, 0x7c, 0xb4, 0xe0, 0x13, 0x1f, 0xfd, 0x68, - 0xb1, 0x22, 0xc9, 0x94, 0x59, 0x13, 0x84, 0xf9, 0x7b, 0x1d, 0x50, 0xfe, 0x34, 0xb3, 0x14, 0x74, - 0xc4, 0xe6, 0xa4, 0x0c, 0xa9, 0xcb, 0x47, 0x91, 0x68, 0xc9, 0x7a, 0x66, 0xc9, 0x51, 0x97, 0xa3, - 0xf4, 0x1c, 0x5d, 0x8e, 0x77, 0xc0, 0xb0, 0xa3, 0x4b, 0xc6, 0x20, 0x48, 0xba, 0x87, 0x27, 0xdc, - 0x44, 0xce, 0xda, 0x2a, 0x3c, 0x0d, 0xf2, 0x41, 0xa5, 0x52, 0x10, 0x54, 0x5e, 0x83, 0xfa, 0xde, - 0x88, 0xda, 0x07, 0xf2, 0x2e, 0x24, 0x62, 0x35, 0x4a, 0x7b, 0x39, 0x17, 0x0f, 0x7b, 0x51, 0x47, - 0x92, 0xc4, 0x97, 0xda, 0x39, 0xe5, 0x52, 0xfb, 0x08, 0x96, 0x12, 0x97, 0x6f, 0x8f, 0x68, 0x40, - 0x66, 0x14, 0x20, 0x94, 0xfc, 0xad, 0xa7, 0xf2, 0xb7, 0xe9, 0xc3, 0xa5, 0xdc, 0x94, 0xb3, 0x39, - 0x5d, 0xcb, 0x30, 0x17, 0x4c, 0x6d, 0x9b, 0xdd, 0x6d, 0xe4, 0x9c, 0x12, 0x34, 0x7f, 0xac, 0x81, - 0x91, 0x34, 0x9a, 0x85, 0x03, 0xce, 0xa0, 0x4f, 0x7f, 0x05, 0xe6, 0xa5, 0x9b, 0x8a, 0x3c, 0x53, - 0xc2, 0x31, 0x7c, 0x5c, 0x0b, 0xde, 0xfc, 0x1e, 0x54, 0x38, 0xdd, 0x09, 0x6f, 0x75, 0x47, 0xb9, - 0xe5, 0x0a, 0xd4, 0xfa, 0x93, 0x91, 0xcb, 0xa3, 0x80, 0xac, 0x8e, 0x12, 0x84, 0xf9, 0xb1, 0x0e, - 0xe7, 0x31, 0x9d, 0x86, 0x84, 0x8b, 0x6a, 0x39, 0x63, 0x37, 0xe0, 0x95, 0x70, 0x13, 0x8c, 0x3e, - 0x9d, 0xfa, 0x36, 0x51, 0x4e, 0xb1, 0xb8, 0x1f, 0xe4, 0xf0, 0x68, 0x1d, 0xce, 0x0a, 0x5c, 0xf6, - 0xe8, 0x65, 0xd1, 
0x4c, 0xaa, 0xa8, 0x72, 0x15, 0xa9, 0xa2, 0xa0, 0xce, 0xe1, 0x99, 0x54, 0x81, - 0x4b, 0xa4, 0x96, 0x85, 0xd4, 0x0c, 0x1a, 0xbd, 0x05, 0x55, 0xd9, 0xe1, 0xaa, 0xf0, 0x3d, 0x49, - 0xe7, 0xc0, 0x82, 0xd5, 0x45, 0x0d, 0x7f, 0xf1, 0x6b, 0x7a, 0xb0, 0x18, 0x59, 0x4b, 0xf8, 0xcb, - 0x31, 0x96, 0x5e, 0x83, 0xfa, 0xf6, 0xc8, 0xc9, 0x18, 0x5b, 0x45, 0x31, 0x8a, 0x2d, 0x72, 0x98, - 0xd9, 0x4d, 0x15, 0x65, 0x7e, 0x59, 0x82, 0x8a, 0x38, 0x64, 0x2b, 0x50, 0xeb, 0x06, 0xfc, 0x19, - 0x40, 0x5e, 0xfe, 0xe6, 0x71, 0x82, 0x60, 0x5a, 0xf0, 0xcf, 0xa4, 0x15, 0x2a, 0x41, 0xf4, 0x36, - 0xd4, 0xc5, 0x67, 0x14, 0x42, 0xf3, 0x7d, 0xc1, 0xac, 0x03, 0x63, 0x95, 0x03, 0xdd, 0x87, 0x73, - 0x5b, 0x84, 0x38, 0x1d, 0x9f, 0x4e, 0x26, 0x11, 0x85, 0x2c, 0x2a, 0x4f, 0x10, 0x93, 0xe7, 0x43, - 0x6f, 0xc2, 0x59, 0x86, 0x6c, 0x39, 0x4e, 0x2c, 0x4a, 0xf4, 0x3f, 0x50, 0x3e, 0x06, 0xe2, 0x2c, - 0x29, 0x6a, 0xc3, 0xe2, 0x7b, 0x13, 0xc7, 0x0a, 0x89, 0x34, 0x61, 0x54, 0xc9, 0x5c, 0x2d, 0x4a, - 0xc3, 0x72, 0x83, 0x70, 0x86, 0x25, 0xfb, 0x02, 0x37, 0x97, 0x7b, 0x81, 0x43, 0xff, 0xc7, 0x1b, - 0x3e, 0x43, 0xc2, 0xfb, 0x22, 0x8b, 0x99, 0x24, 0x1f, 0xbd, 0xc4, 0x0c, 0x45, 0xb3, 0x67, 0x48, - 0xd0, 0x2e, 0x5c, 0x28, 0x70, 0x9c, 0x60, 0xb9, 0xc6, 0x75, 0x5b, 0x3b, 0xc9, 0xc3, 0x70, 0x21, - 0xb7, 0xf9, 0x43, 0xb8, 0x10, 0x67, 0x02, 0xf5, 0x71, 0xf1, 0x05, 0x32, 0xd0, 0x7a, 0xd4, 0xb8, - 0xd2, 0x8f, 0x0c, 0xe3, 0xb2, 0x5f, 0x55, 0xf4, 0x5c, 0xf3, 0x77, 0x8d, 0x9d, 0xaa, 0xd4, 0xe3, - 0xf4, 0x8b, 0x4c, 0x5e, 0x94, 0xb6, 0xf4, 0x59, 0xa4, 0xad, 0xa2, 0x2b, 0xe8, 0x2d, 0xb8, 0x28, - 0x0a, 0x9e, 0xc0, 0x7d, 0x46, 0x06, 0x13, 0xe2, 0x0f, 0x02, 0x62, 0x53, 0x4f, 0x5c, 0x7e, 0x74, - 0x8c, 0xf8, 0x60, 0xdf, 0x7d, 0x46, 0x76, 0x88, 0xdf, 0xe7, 0x23, 0x45, 0x7d, 0x7c, 0xf3, 0x37, - 0x1a, 0x20, 0xf5, 0xf5, 0x6d, 0x36, 0x19, 0xeb, 0x5d, 0x68, 0xec, 0x25, 0x42, 0xe3, 0x57, 0xb5, - 0x97, 0x8a, 0xb3, 0xbe, 0x3a, 0x7f, 0x9a, 0xaf, 0x70, 0x97, 0x1c, 0x58, 0x50, 0x6b, 0x2f, 0x46, - 0x13, 0xba, 0x71, 0x00, 0xe6, 0xdf, 0x0c, 0xe7, 0x51, 
0x27, 0x8a, 0xb4, 0xfc, 0x9b, 0xe1, 0xec, - 0x48, 0x56, 0x0d, 0xf3, 0x6f, 0x16, 0x44, 0xc6, 0xe2, 0xe9, 0x47, 0x86, 0xcf, 0x08, 0x34, 0x5f, - 0x87, 0x85, 0x6c, 0x1b, 0x7b, 0xdf, 0x1d, 0xee, 0xcb, 0xd7, 0x6d, 0xfe, 0x8d, 0x0c, 0x28, 0x8d, - 0xe8, 0xa1, 0x0c, 0x3f, 0xec, 0x93, 0xe9, 0xa6, 0x9a, 0xe5, 0xf9, 0xb8, 0xb8, 0xb6, 0x49, 0xb0, - 0xe7, 0xdf, 0x2c, 0x69, 0x45, 0x37, 0x3c, 0xa9, 0x5a, 0x0c, 0x9b, 0xdf, 0x87, 0xeb, 0x9b, 0x74, - 0xa8, 0x34, 0x87, 0x92, 0xf7, 0xac, 0xd9, 0x6c, 0xa0, 0xf9, 0xb1, 0x06, 0x6b, 0x47, 0x4f, 0x31, - 0x9b, 0x12, 0xe3, 0xa4, 0xc7, 0xb2, 0x11, 0xb3, 0x25, 0xb1, 0x0f, 0x82, 0xe9, 0xb8, 0x47, 0x42, - 0x0b, 0xfd, 0x7f, 0x74, 0xb6, 0x8b, 0x6a, 0x8b, 0x88, 0x32, 0x75, 0xc6, 0x9b, 0x60, 0xd8, 0x2a, - 0xbe, 0x4f, 0x1e, 0xc9, 0x79, 0x72, 0x78, 0xf3, 0xa7, 0x1a, 0x5c, 0x54, 0xde, 0x79, 0x49, 0x18, - 0x49, 0x44, 0x17, 0xa0, 0x22, 0x9a, 0xf5, 0x62, 0x13, 0x05, 0xc0, 0x3c, 0xe7, 0x09, 0xf5, 0xef, - 0xb1, 0xcd, 0x95, 0xe9, 0x47, 0x82, 0x68, 0x09, 0xaa, 0x4f, 0xa8, 0xbf, 0x49, 0x0f, 0xe5, 0xb9, - 0x95, 0x90, 0x28, 0xa9, 0xc6, 0x9c, 0xa3, 0x2c, 0x9b, 0x1a, 0x02, 0x64, 0x1c, 0xc1, 0x74, 0xcc, - 0x38, 0x44, 0x81, 0x2a, 0x21, 0xf3, 0x57, 0x1a, 0xac, 0x15, 0xea, 0xd4, 0xb2, 0x0f, 0x66, 0xb5, - 0x0b, 0x17, 0xa0, 0xa2, 0xf6, 0x2e, 0x05, 0x50, 0x74, 0xee, 0xa2, 0xff, 0xbc, 0x94, 0xe3, 0xff, - 0xbc, 0x98, 0x7f, 0xd1, 0xc0, 0x2c, 0xd4, 0x4f, 0xe4, 0x9f, 0x19, 0x05, 0x93, 0x53, 0x68, 0x88, - 0xde, 0x82, 0xf9, 0x68, 0xa7, 0xb9, 0x6d, 0xb3, 0xff, 0x98, 0x28, 0xd4, 0x1e, 0xc7, 0x3c, 0xcd, - 0x6b, 0x51, 0xf1, 0x84, 0x6a, 0x50, 0x79, 0xe0, 0xbb, 0x21, 0x31, 0xce, 0xa0, 0x79, 0x28, 0xef, - 0x58, 0x41, 0x60, 0x68, 0xcd, 0x75, 0x51, 0x1b, 0x29, 0x4f, 0x82, 0x00, 0xd5, 0xb6, 0x4f, 0x2c, - 0x4e, 0x07, 0x50, 0x15, 0x7d, 0x3d, 0x43, 0x6b, 0xf6, 0x60, 0x41, 0x7d, 0x09, 0x64, 0xe2, 0xb6, - 0x07, 0x2d, 0xc7, 0x31, 0xce, 0xa0, 0x05, 0x98, 0xdf, 0x1e, 0x44, 0x84, 0x8c, 0x69, 0x7b, 0xd0, - 0x63, 0xdf, 0x3a, 0xaa, 0xc3, 0xdc, 0xf6, 0x80, 0x57, 0xa3, 0x46, 0x49, 0x00, 0xfc, 0xb9, 
0xd0, - 0x28, 0x37, 0xef, 0xc0, 0x82, 0xda, 0xf9, 0x66, 0xe2, 0x5a, 0x9b, 0xdd, 0xf7, 0x37, 0x84, 0xb8, - 0x0e, 0x6e, 0x75, 0xb7, 0xba, 0x5b, 0xef, 0x1a, 0x1a, 0x83, 0xfa, 0xbb, 0xdb, 0x3b, 0x3b, 0x0c, - 0xd2, 0x9b, 0x6f, 0x00, 0x24, 0xc9, 0x9c, 0xad, 0x63, 0x6b, 0x7b, 0x8b, 0xf1, 0xd4, 0x61, 0xee, - 0x41, 0xab, 0xbb, 0x2b, 0x58, 0x18, 0x80, 0x05, 0xa0, 0x33, 0x9a, 0x0e, 0xa3, 0x29, 0x35, 0x5f, - 0xcd, 0x94, 0xf8, 0x68, 0x0e, 0x4a, 0xad, 0xd1, 0xc8, 0x38, 0x83, 0xaa, 0xa0, 0x77, 0xee, 0x0a, - 0xd5, 0xb7, 0xa8, 0x3f, 0xb6, 0x46, 0x86, 0xde, 0x7c, 0x17, 0x2e, 0x1f, 0x59, 0x5a, 0x72, 0x6d, - 0x3b, 0xbd, 0xee, 0xae, 0x98, 0x19, 0x6f, 0x6c, 0x6e, 0xb4, 0xfa, 0x1b, 0x86, 0x86, 0x10, 0x2c, - 0x4a, 0x60, 0xd0, 0x6f, 0xdf, 0xdb, 0xe8, 0xb5, 0x0c, 0xbd, 0xf9, 0x0c, 0x16, 0xd3, 0xf9, 0x92, - 0xeb, 0x47, 0xfd, 0x03, 0xd7, 0x1b, 0x0a, 0xfe, 0x7e, 0xc8, 0xcb, 0x2d, 0xa1, 0xb9, 0xb0, 0xa3, - 0x63, 0xe8, 0xc8, 0x80, 0x85, 0xae, 0xe7, 0x86, 0xae, 0x35, 0x72, 0x9f, 0x31, 0xda, 0x12, 0x6a, - 0x40, 0x6d, 0xc7, 0x27, 0x13, 0xcb, 0x67, 0x60, 0x19, 0x2d, 0x02, 0x70, 0x73, 0x62, 0x62, 0x39, - 0x4f, 0x8d, 0x0a, 0x63, 0x78, 0x60, 0xb9, 0xa1, 0xeb, 0x0d, 0x85, 0x95, 0xab, 0xcd, 0x6f, 0x41, - 0x23, 0x15, 0x57, 0xd0, 0x39, 0x68, 0xbc, 0xb7, 0xd5, 0xdd, 0xea, 0xee, 0x76, 0x5b, 0x9b, 0xdd, - 0x0f, 0x36, 0x3a, 0xc2, 0xdc, 0xbd, 0x6e, 0xbf, 0xd7, 0xda, 0x6d, 0xdf, 0x33, 0x34, 0xb6, 0x32, - 0xf1, 0xa9, 0xdf, 0x7d, 0xeb, 0x0f, 0x9f, 0xaf, 0x6a, 0x9f, 0x7e, 0xbe, 0xaa, 0xfd, 0xf5, 0xf3, - 0x55, 0xed, 0x27, 0x5f, 0xac, 0x9e, 0xf9, 0xf4, 0x8b, 0xd5, 0x33, 0x9f, 0x7d, 0xb1, 0x7a, 0xe6, - 0x83, 0x97, 0x87, 0x6e, 0xb8, 0x3f, 0xdd, 0xbb, 0x61, 0xd3, 0xf1, 0xcd, 0x89, 0xeb, 0x0d, 0x6d, - 0x6b, 0x72, 0x33, 0x74, 0x6d, 0xc7, 0xbe, 0xa9, 0xb8, 0xe6, 0x5e, 0x95, 0xf7, 0xe6, 0x5f, 0xfb, - 0x57, 0x00, 0x00, 0x00, 0xff, 0xff, 0xe3, 0xbe, 0xe3, 0x8e, 0xbb, 0x28, 0x00, 0x00, + // 2989 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xcc, 0x3a, 0x4b, 0x6f, 0x1c, 
0xc7, + 0xd1, 0x9a, 0xd9, 0x17, 0xb7, 0xc8, 0xa5, 0x46, 0x2d, 0x89, 0xa2, 0x24, 0x8a, 0xa2, 0xe7, 0xf3, + 0xf7, 0x81, 0x5e, 0xfb, 0x93, 0x22, 0xd9, 0xca, 0xc3, 0x71, 0xec, 0xac, 0x76, 0x69, 0x6b, 0x21, + 0x2e, 0x49, 0xf4, 0xd2, 0x56, 0xe0, 0x1c, 0x36, 0xc3, 0x99, 0xd6, 0x72, 0xcc, 0xdd, 0xe9, 0xf5, + 0xcc, 0xac, 0x28, 0x09, 0x70, 0x02, 0x23, 0xc9, 0x2d, 0x87, 0x04, 0xc8, 0x21, 0x87, 0xe4, 0x92, + 0x1f, 0x10, 0xe4, 0x0f, 0x04, 0xc8, 0x21, 0x87, 0x9c, 0x02, 0x9f, 0x02, 0x9f, 0x12, 0xc3, 0xbe, + 0x07, 0x01, 0x02, 0x24, 0xd7, 0xa0, 0x1f, 0x33, 0xd3, 0x33, 0x3b, 0xcb, 0x47, 0xb8, 0x30, 0x72, + 0xda, 0xa9, 0xea, 0xaa, 0xea, 0xea, 0xea, 0xea, 0xea, 0xea, 0xaa, 0x85, 0xeb, 0xfb, 0xc4, 0xf2, + 0xc3, 0x3d, 0x62, 0x85, 0xa3, 0xbd, 0xdb, 0xf1, 0xf7, 0xad, 0x91, 0x4f, 0x43, 0x8a, 0xe6, 0x95, + 0x41, 0xf3, 0x19, 0x54, 0x77, 0xad, 0xbd, 0x01, 0xe9, 0x8e, 0x2c, 0x0f, 0x2d, 0x43, 0x85, 0x03, + 0xed, 0xd6, 0xb2, 0xb6, 0xa6, 0xad, 0x17, 0x70, 0x04, 0xa2, 0x6b, 0x30, 0xd7, 0x0d, 0x2d, 0x3f, + 0x7c, 0x48, 0x9e, 0x2d, 0xeb, 0x6b, 0xda, 0xfa, 0x02, 0x8e, 0x61, 0xb4, 0x04, 0xe5, 0x0d, 0xcf, + 0x61, 0x23, 0x05, 0x3e, 0x22, 0x21, 0xb4, 0x0a, 0xf0, 0x90, 0x3c, 0x0b, 0x46, 0x96, 0xcd, 0x04, + 0x16, 0xd7, 0xb4, 0xf5, 0x1a, 0x56, 0x30, 0xe6, 0x9f, 0x75, 0x30, 0x1e, 0x30, 0x55, 0xee, 0x13, + 0x2b, 0xc4, 0xe4, 0xc3, 0x31, 0x09, 0x42, 0xf4, 0x2d, 0x58, 0xb0, 0xf7, 0x2d, 0xaf, 0x4f, 0x1e, + 0x13, 0xe2, 0x48, 0x3d, 0xe6, 0xef, 0x5e, 0xbd, 0xa5, 0xe8, 0x7c, 0xab, 0xa9, 0x10, 0xe0, 0x14, + 0x39, 0x7a, 0x0d, 0xaa, 0x87, 0x56, 0x48, 0xfc, 0xa1, 0xe5, 0x1f, 0x70, 0x45, 0xe7, 0xef, 0x2e, + 0xa5, 0x78, 0x1f, 0x45, 0xa3, 0x38, 0x21, 0x44, 0x6f, 0x40, 0xcd, 0x27, 0x0e, 0x8d, 0xc7, 0xf8, + 0x42, 0xa6, 0x73, 0xa6, 0x89, 0xd1, 0xd7, 0x61, 0x2e, 0x08, 0xad, 0x70, 0x1c, 0x90, 0x60, 0xb9, + 0xb8, 0x56, 0x58, 0x9f, 0xbf, 0xbb, 0x92, 0x62, 0x8c, 0xed, 0xdb, 0xe5, 0x54, 0x38, 0xa6, 0x46, + 0xeb, 0x70, 0xde, 0xa6, 0xc3, 0x11, 0x19, 0x90, 0x90, 0x88, 0xc1, 0xe5, 0xd2, 0x9a, 0xb6, 0x3e, + 0x87, 0xb3, 0x68, 0xf4, 0x32, 
0x14, 0x88, 0xef, 0x2f, 0x97, 0x73, 0xac, 0x81, 0xc7, 0x9e, 0xe7, + 0x7a, 0xfd, 0x0d, 0xdf, 0xa7, 0x3e, 0x66, 0x54, 0xe6, 0x8f, 0x35, 0xa8, 0x26, 0xea, 0x99, 0xcc, + 0xa2, 0xc4, 0x3e, 0x18, 0x51, 0xd7, 0x0b, 0x77, 0x03, 0x6e, 0xd1, 0x22, 0x4e, 0xe1, 0xd8, 0x56, + 0xf9, 0x24, 0xa0, 0x83, 0x27, 0xc4, 0xd9, 0x0d, 0xb8, 0xdd, 0x8a, 0x58, 0xc1, 0x20, 0x03, 0x0a, + 0x01, 0xf9, 0x90, 0x9b, 0xa5, 0x88, 0xd9, 0x27, 0x93, 0x3a, 0xb0, 0x82, 0xb0, 0xfb, 0xcc, 0xb3, + 0x39, 0x4f, 0x51, 0x48, 0x55, 0x71, 0xe6, 0x47, 0x60, 0xb4, 0xdc, 0x60, 0x64, 0x85, 0xf6, 0x3e, + 0xf1, 0x1b, 0x76, 0xe8, 0x52, 0x0f, 0xbd, 0x0c, 0x65, 0x8b, 0x7f, 0x71, 0x3d, 0x16, 0xef, 0x5e, + 0x4c, 0xad, 0x45, 0x10, 0x61, 0x49, 0xc2, 0xbc, 0xae, 0x49, 0x87, 0x43, 0x37, 0x8c, 0x95, 0x8a, + 0x61, 0xb4, 0x06, 0xf3, 0xed, 0x80, 0x4d, 0xb5, 0xc3, 0xd6, 0xc0, 0x55, 0x9b, 0xc3, 0x2a, 0xca, + 0x6c, 0x42, 0xa1, 0xd1, 0x7c, 0x98, 0x12, 0xa2, 0x1d, 0x2d, 0x44, 0x9f, 0x14, 0x82, 0x01, 0xb5, + 0xfb, 0x1e, 0xf5, 0x89, 0x73, 0x7f, 0x40, 0xed, 0x03, 0xb9, 0x1d, 0x67, 0x93, 0xf9, 0x43, 0x1d, + 0x2e, 0xb7, 0xbd, 0xc7, 0x83, 0x31, 0x61, 0x86, 0x4a, 0x4c, 0x14, 0xa0, 0x6f, 0x43, 0x2d, 0x1e, + 0xd8, 0x7d, 0x36, 0x22, 0xd2, 0x48, 0xd7, 0x52, 0x46, 0x4a, 0x51, 0xe0, 0x34, 0x03, 0x7a, 0x0b, + 0x6a, 0x89, 0xc0, 0x76, 0x8b, 0xd9, 0xad, 0x30, 0xe1, 0x32, 0x2a, 0x05, 0x4e, 0xd3, 0xf3, 0x93, + 0x6e, 0xef, 0x93, 0xa1, 0xd5, 0x6e, 0x71, 0xa3, 0x16, 0x70, 0x0c, 0xa3, 0x87, 0x70, 0x91, 0x3c, + 0xb5, 0x07, 0x63, 0x87, 0x28, 0x3c, 0x0e, 0xdf, 0xfb, 0x23, 0xa7, 0xc8, 0xe3, 0x32, 0x7f, 0xa1, + 0xab, 0xee, 0x21, 0x0d, 0xfb, 0x1d, 0xb8, 0xec, 0xe6, 0x59, 0x46, 0xc6, 0x01, 0x33, 0xdf, 0x10, + 0x2a, 0x25, 0xce, 0x17, 0x80, 0xee, 0xc5, 0x8e, 0x27, 0xc2, 0xc2, 0x8d, 0x29, 0xea, 0x66, 0x5c, + 0xd0, 0x84, 0x82, 0x65, 0x47, 0x01, 0xc1, 0x48, 0x3b, 0x6b, 0xf3, 0x21, 0x66, 0x83, 0x68, 0x1b, + 0x90, 0x3b, 0xe1, 0x23, 0xd2, 0x2a, 0x37, 0xd3, 0x1a, 0x4f, 0x90, 0xe1, 0x1c, 0x56, 0xf3, 0x33, + 0x0d, 0x2e, 0x28, 0x91, 0x31, 0x18, 0x51, 0x2f, 0x20, 0x67, 0x0d, 
0x8d, 0x1d, 0x40, 0x4e, 0xc6, + 0xdc, 0x24, 0x72, 0x8f, 0x69, 0xc6, 0x88, 0x74, 0x9c, 0x64, 0x44, 0x08, 0x8a, 0x43, 0xea, 0x10, + 0xe9, 0x23, 0xfc, 0x1b, 0xbd, 0x04, 0xc6, 0xd0, 0x72, 0xbd, 0xd0, 0x72, 0x3d, 0xe2, 0xf7, 0xc8, + 0x88, 0xda, 0xfb, 0x32, 0x30, 0x9c, 0x4f, 0xf0, 0x1b, 0x0c, 0x6d, 0x3e, 0x85, 0x8b, 0x4d, 0x25, + 0x02, 0x75, 0x48, 0x10, 0x58, 0xfd, 0x33, 0xaf, 0x31, 0x1b, 0xeb, 0xf4, 0xc9, 0x58, 0x67, 0xfe, + 0x5c, 0x83, 0xf3, 0x98, 0x38, 0xb4, 0x43, 0x42, 0x6b, 0x46, 0xd3, 0x1e, 0x17, 0x3e, 0xb3, 0x6a, + 0x15, 0x72, 0xd4, 0xfa, 0x3e, 0xdc, 0x60, 0x5a, 0xe1, 0x98, 0x6b, 0xc7, 0xa7, 0x7d, 0x9f, 0x04, + 0xc1, 0x97, 0xa3, 0xa3, 0xf9, 0x11, 0xac, 0xa4, 0xe7, 0x7f, 0x9b, 0xfa, 0x87, 0x96, 0xef, 0x7c, + 0x49, 0xd3, 0xff, 0x43, 0x53, 0xa3, 0x41, 0x93, 0x7a, 0x8f, 0xdd, 0x3e, 0xaa, 0x43, 0x31, 0x18, + 0x59, 0x9e, 0x9c, 0x6b, 0x29, 0xff, 0x56, 0xc5, 0x9c, 0x86, 0xe5, 0x2e, 0x01, 0xcb, 0x48, 0x62, + 0xe9, 0x11, 0xc8, 0x34, 0x77, 0x94, 0x68, 0x24, 0xcf, 0xf2, 0x11, 0xe1, 0x2a, 0x45, 0xce, 0x02, + 0x62, 0x10, 0x05, 0xc4, 0xa2, 0x08, 0x88, 0x11, 0x1c, 0x1f, 0x82, 0x92, 0x72, 0x08, 0xea, 0x60, + 0x04, 0x07, 0xee, 0xa8, 0xd5, 0xd9, 0x6c, 0x04, 0x5d, 0xa9, 0x51, 0x99, 0x5f, 0x02, 0x13, 0x78, + 0xf3, 0xf7, 0x3a, 0x5c, 0x65, 0xd1, 0xd5, 0x19, 0x0f, 0x94, 0xe0, 0x38, 0xa3, 0x5c, 0xe8, 0x1e, + 0x94, 0x6d, 0x6e, 0xc7, 0x63, 0x22, 0x9e, 0x30, 0x36, 0x96, 0xc4, 0xa8, 0x09, 0x8b, 0x81, 0x54, + 0x49, 0xc4, 0x42, 0x6e, 0xb0, 0xc5, 0xbb, 0xd7, 0x53, 0xec, 0xdd, 0x14, 0x09, 0xce, 0xb0, 0x30, + 0xd5, 0xe9, 0x88, 0xf8, 0x56, 0x48, 0x7d, 0x7e, 0x8f, 0x15, 0xb9, 0x88, 0xb4, 0xea, 0xdb, 0x0a, + 0x01, 0x4e, 0x91, 0xe7, 0x06, 0x92, 0x52, 0x7e, 0x20, 0xf9, 0xb5, 0x0e, 0x4b, 0x1d, 0xe2, 0xf7, + 0x67, 0x6f, 0xbf, 0xb7, 0xa0, 0xe6, 0x9c, 0xf2, 0x2a, 0x4d, 0xd1, 0xa3, 0x36, 0xa0, 0x21, 0xd3, + 0xcc, 0x69, 0x9d, 0xca, 0xfd, 0x72, 0x98, 0x62, 0x47, 0x2b, 0x1e, 0x13, 0x6d, 0xa7, 0x18, 0x69, + 0x07, 0x2e, 0x76, 0x62, 0xd4, 0x83, 0x68, 0x62, 0xf4, 0x0d, 0x25, 0x73, 0xd5, 0x72, 0x2e, 0x82, + 0x84, 
0x27, 0x9b, 0xba, 0x9a, 0x9f, 0x6a, 0x50, 0x6b, 0xf9, 0x96, 0xeb, 0x45, 0x61, 0x0a, 0xbd, + 0x08, 0x8b, 0xa1, 0xe5, 0xf7, 0x49, 0xd8, 0xf3, 0xa8, 0x43, 0x7a, 0xae, 0xc3, 0xed, 0x5d, 0xc5, + 0x0b, 0x02, 0xbb, 0x45, 0x1d, 0xd2, 0x76, 0xd0, 0x0b, 0x20, 0x61, 0xa9, 0xb0, 0x38, 0xab, 0xf3, + 0x02, 0xc7, 0x95, 0x45, 0x5f, 0x85, 0x2b, 0x92, 0x24, 0x31, 0x67, 0xcf, 0xa6, 0x63, 0x99, 0xe5, + 0xd5, 0xf0, 0x65, 0x31, 0xac, 0x7a, 0xf0, 0xd8, 0x0b, 0xd1, 0xdb, 0xb0, 0x26, 0xf9, 0x58, 0x06, + 0xe0, 0xf6, 0xf7, 0xc3, 0x9e, 0xc3, 0x34, 0xec, 0x0d, 0xe9, 0x13, 0x22, 0x05, 0x88, 0x57, 0xc8, + 0x8a, 0xa0, 0x6b, 0x4b, 0x32, 0xbe, 0x8e, 0x0e, 0x7d, 0x42, 0xb8, 0x1c, 0xf3, 0x37, 0x05, 0x30, + 0xb2, 0x2b, 0x3f, 0xab, 0x2f, 0xdd, 0x00, 0x60, 0x5f, 0x3d, 0x66, 0x3f, 0xc2, 0x17, 0x5d, 0xc5, + 0x55, 0x86, 0x61, 0xe2, 0x09, 0xba, 0x03, 0x25, 0x31, 0x92, 0x77, 0xd4, 0x9a, 0x74, 0x38, 0xa2, + 0x1e, 0xf1, 0x42, 0x4e, 0x8b, 0x05, 0x25, 0xfa, 0x1f, 0xa8, 0x25, 0xf7, 0x47, 0x2f, 0x8c, 0x33, + 0xf0, 0x54, 0x5e, 0x2f, 0x9f, 0x0d, 0xa5, 0x1c, 0xc7, 0x9d, 0x78, 0x36, 0xa0, 0xff, 0x85, 0xc5, + 0x3d, 0x4a, 0xc3, 0x20, 0xf4, 0xad, 0x51, 0xcf, 0xa1, 0x1e, 0x91, 0x61, 0xab, 0x16, 0x63, 0x5b, + 0xd4, 0x23, 0x13, 0x99, 0x7f, 0x65, 0x32, 0xf3, 0x47, 0x0d, 0x58, 0x14, 0xa6, 0x1f, 0x49, 0xef, + 0x58, 0x9e, 0xe3, 0xf6, 0x4a, 0x27, 0xb2, 0x29, 0xff, 0xc1, 0x35, 0x27, 0xe5, 0x4e, 0x79, 0xde, + 0x5d, 0xcd, 0xf7, 0xee, 0xbf, 0x69, 0x50, 0x63, 0xee, 0x95, 0x38, 0xf6, 0x3d, 0x98, 0x1b, 0xb8, + 0x4f, 0x88, 0xc7, 0x66, 0xd6, 0x72, 0x42, 0x0f, 0xa3, 0xde, 0x94, 0x04, 0x38, 0x26, 0x65, 0xbb, + 0xc4, 0x7d, 0x57, 0x75, 0xcd, 0x2a, 0xc3, 0x08, 0xc7, 0x6c, 0xc1, 0x4d, 0xc5, 0x23, 0xc5, 0x02, + 0x33, 0x2e, 0x5f, 0xe0, 0x3b, 0x7b, 0x3d, 0x21, 0xe3, 0x6b, 0xdc, 0x55, 0x4f, 0x40, 0x03, 0x6e, + 0x4c, 0x93, 0xa2, 0x66, 0x4c, 0xd7, 0x72, 0x65, 0x88, 0x05, 0x7f, 0x00, 0x4b, 0x5d, 0x21, 0x2f, + 0x5e, 0x84, 0x0c, 0x79, 0x77, 0xa0, 0x2c, 0x64, 0x1d, 0xbf, 0x6c, 0x49, 0x78, 0xcc, 0xa2, 0xcd, + 0x21, 0x5c, 0x99, 0x98, 0x4b, 0x26, 0xa4, 
0xaf, 0x42, 0xc5, 0x1a, 0x8d, 0x06, 0x2e, 0x71, 0x8e, + 0x9f, 0x2d, 0xa2, 0x3c, 0x6e, 0xba, 0x0f, 0xe0, 0x66, 0x57, 0x3d, 0xda, 0xca, 0xda, 0xa3, 0x35, + 0xce, 0x2a, 0xd0, 0x98, 0x5f, 0x83, 0xeb, 0x4d, 0x4a, 0x7d, 0xc7, 0xf5, 0xd8, 0xc5, 0x73, 0x3f, + 0xf2, 0xf2, 0x68, 0x9e, 0x65, 0xa8, 0x3c, 0x21, 0x7e, 0x10, 0xbd, 0x55, 0x0b, 0x38, 0x02, 0xd9, + 0xd3, 0x65, 0x25, 0x9f, 0x53, 0x5a, 0xe6, 0x3f, 0x0f, 0xac, 0xe8, 0x35, 0x58, 0x8a, 0x8f, 0x4e, + 0x48, 0x6d, 0x3a, 0xe8, 0x45, 0x4a, 0xe8, 0x3c, 0x76, 0x5d, 0x8a, 0x8e, 0x09, 0x1f, 0x7c, 0x4f, + 0x8c, 0xfd, 0xf7, 0xb8, 0xe6, 0x3f, 0x35, 0xb8, 0xd4, 0x70, 0x9c, 0x64, 0x81, 0x91, 0x35, 0x5f, + 0x02, 0x5d, 0xee, 0xd4, 0x91, 0x61, 0x53, 0x77, 0x1d, 0xb4, 0x94, 0x4a, 0x5c, 0x16, 0xe2, 0xcc, + 0x64, 0x22, 0xe4, 0xe5, 0xe4, 0xd1, 0xa8, 0x0e, 0x17, 0xdc, 0xa0, 0xe7, 0x91, 0xc3, 0x5e, 0x12, + 0x80, 0xb9, 0xde, 0x73, 0xf8, 0xbc, 0x1b, 0x6c, 0x91, 0xc3, 0x64, 0x3a, 0x74, 0x13, 0xe6, 0x0f, + 0x64, 0x3d, 0x8a, 0x59, 0xa8, 0x24, 0x4a, 0x54, 0x11, 0xaa, 0xed, 0xe4, 0x06, 0xa1, 0x72, 0x7e, + 0x10, 0xfa, 0x83, 0x06, 0x57, 0x30, 0x61, 0x57, 0xcd, 0x99, 0xd6, 0xbe, 0x0c, 0x15, 0xdb, 0x0a, + 0x6c, 0xcb, 0x21, 0xb2, 0x72, 0x10, 0x81, 0x6c, 0xc4, 0xe7, 0xf2, 0x1d, 0x59, 0xec, 0x88, 0xc0, + 0xec, 0x32, 0x8a, 0x27, 0x5a, 0xc6, 0x94, 0x4c, 0xe1, 0x4f, 0x05, 0xb8, 0x96, 0x2c, 0x60, 0xe2, + 0x4c, 0x9c, 0xf1, 0x1a, 0x9c, 0xb6, 0xb3, 0x57, 0xf9, 0x79, 0xf1, 0x95, 0x4d, 0x8d, 0xb3, 0x77, + 0x1b, 0x5e, 0x08, 0x59, 0xaa, 0xdf, 0x0b, 0x7d, 0xb7, 0xdf, 0x67, 0xea, 0x3f, 0x21, 0x5e, 0x2a, + 0x35, 0x70, 0x4f, 0x50, 0x81, 0xb8, 0xc1, 0x65, 0xec, 0x0a, 0x11, 0x1b, 0x4c, 0x82, 0x5a, 0x8b, + 0xc8, 0x77, 0x9a, 0x52, 0xbe, 0xd3, 0x58, 0x2c, 0xcd, 0x50, 0x15, 0xf2, 0x89, 0x43, 0x33, 0xfa, + 0x94, 0x8f, 0xd3, 0x67, 0x45, 0xd5, 0x87, 0xbd, 0xbb, 0x52, 0xea, 0x64, 0x36, 0xb4, 0x72, 0xa2, + 0x0d, 0x9d, 0xcb, 0xdf, 0xd0, 0x1f, 0x15, 0xe0, 0x7a, 0xee, 0x86, 0xce, 0xa6, 0xaa, 0x70, 0x0f, + 0x4a, 0xec, 0xf9, 0x15, 0x25, 0xc7, 0xe9, 0x72, 0x47, 0x3c, 0x5b, 0xf2, 0x58, 
0x13, 0xd4, 0x51, + 0x62, 0x52, 0x38, 0x49, 0x3d, 0xf3, 0x64, 0xa9, 0xce, 0x2b, 0x80, 0xf8, 0x46, 0xa4, 0x29, 0x85, + 0x97, 0x1b, 0x6c, 0x44, 0x2d, 0x37, 0xa0, 0x16, 0x54, 0xa3, 0x07, 0x07, 0x7b, 0x9d, 0x31, 0xd5, + 0xff, 0x2f, 0xf7, 0x7d, 0x33, 0xf1, 0xaa, 0xc0, 0x09, 0x63, 0xee, 0x36, 0x54, 0xf2, 0xb7, 0xe1, + 0x2f, 0x3a, 0xac, 0x26, 0xdb, 0xb0, 0x43, 0x83, 0x70, 0xd6, 0x67, 0xeb, 0x44, 0x07, 0x45, 0x3f, + 0xe3, 0x41, 0xb9, 0x03, 0x15, 0xf1, 0xf8, 0x65, 0xe7, 0x94, 0x59, 0xed, 0xca, 0x84, 0xd5, 0x86, + 0x56, 0xdb, 0x7b, 0x4c, 0x71, 0x44, 0x87, 0x5e, 0x87, 0x05, 0xbe, 0x31, 0x11, 0x5f, 0xf1, 0x68, + 0xbe, 0x79, 0x46, 0xdc, 0x95, 0xbc, 0xa7, 0x08, 0x5c, 0xbf, 0xd2, 0xe1, 0xe6, 0x54, 0x03, 0xcf, + 0xc6, 0xd7, 0xbf, 0x14, 0x0b, 0x9f, 0xea, 0x64, 0x9c, 0xaa, 0xe0, 0x06, 0x89, 0x95, 0x53, 0x55, + 0x5e, 0x2d, 0x53, 0xe5, 0x5d, 0x8d, 0x28, 0xb7, 0xac, 0x61, 0xf4, 0x56, 0x51, 0x30, 0xe8, 0x16, + 0x94, 0xf9, 0x79, 0x8e, 0x5c, 0x20, 0xa7, 0x2e, 0xc3, 0x77, 0x52, 0x52, 0x99, 0x4d, 0xd9, 0x62, + 0xe2, 0x13, 0x4f, 0x6f, 0x31, 0xad, 0x48, 0x32, 0x65, 0xd6, 0x04, 0x61, 0xfe, 0x4e, 0x07, 0x34, + 0x19, 0x4e, 0xd8, 0xcd, 0x3a, 0x65, 0x1f, 0x53, 0x36, 0xd7, 0x65, 0x0b, 0x2b, 0x5a, 0xb2, 0x9e, + 0x59, 0x72, 0x54, 0x68, 0x2a, 0x9c, 0xa0, 0xd0, 0xf4, 0x36, 0x18, 0x76, 0xf4, 0x22, 0xeb, 0x05, + 0x49, 0xad, 0xf7, 0x98, 0x67, 0xdb, 0x79, 0x5b, 0x85, 0xc7, 0xc1, 0x64, 0x54, 0x2b, 0xe5, 0x44, + 0xb5, 0x57, 0x61, 0x7e, 0x6f, 0x40, 0xed, 0x03, 0xf9, 0x70, 0x14, 0xf7, 0x0a, 0x4a, 0x9f, 0x1d, + 0x2e, 0x1e, 0xf6, 0xa2, 0xfa, 0x31, 0x89, 0x8b, 0x05, 0x95, 0xa4, 0x58, 0x60, 0xfe, 0x52, 0x83, + 0xa5, 0xe4, 0x78, 0x34, 0x07, 0x34, 0x20, 0x33, 0x8a, 0x3b, 0x4a, 0x5e, 0xa2, 0xa7, 0xf3, 0x92, + 0x3c, 0xef, 0x2c, 0x4c, 0x39, 0xbd, 0x1a, 0x5c, 0x99, 0x50, 0x6f, 0x36, 0xa7, 0x76, 0x19, 0x2a, + 0xc1, 0xd8, 0xb6, 0xd9, 0x53, 0x50, 0xea, 0x27, 0xc1, 0xd3, 0xe8, 0xf7, 0x13, 0x0d, 0x8c, 0xa4, + 0xdd, 0x20, 0x1c, 0x7b, 0x06, 0xdd, 0x9a, 0x6b, 0x30, 0x27, 0xdd, 0x5f, 0x5c, 0xa0, 0x05, 0x1c, + 0xc3, 0x47, 0x35, 
0x62, 0xcc, 0xef, 0x42, 0x89, 0xd3, 0x1d, 0xd3, 0xb1, 0x9d, 0xe6, 0xee, 0x2b, + 0x50, 0xed, 0x8e, 0x06, 0x2e, 0x0f, 0x44, 0x32, 0x99, 0x4c, 0x10, 0xe6, 0xc7, 0x3a, 0x5c, 0xc4, + 0x74, 0x1c, 0x12, 0x2e, 0xaa, 0xe1, 0x0c, 0xdd, 0x80, 0xbf, 0x31, 0xea, 0x60, 0x74, 0xe9, 0xd8, + 0xb7, 0x89, 0x12, 0x1d, 0xc4, 0xcb, 0x6b, 0x02, 0x8f, 0xd6, 0xe1, 0xbc, 0xc0, 0x65, 0x8f, 0x74, + 0x16, 0xcd, 0xa4, 0x8a, 0xf7, 0x83, 0x22, 0x55, 0x3c, 0x55, 0x26, 0xf0, 0x4c, 0xaa, 0xc0, 0x25, + 0x52, 0x8b, 0x42, 0x6a, 0x06, 0x8d, 0xde, 0x84, 0xb2, 0x2c, 0x5e, 0x96, 0xf8, 0x9e, 0xa4, 0x2f, + 0xf7, 0x9c, 0xd5, 0x45, 0x6d, 0x1f, 0xf1, 0x6b, 0x7a, 0xb0, 0x18, 0x59, 0x4b, 0xb8, 0xd6, 0x11, + 0x96, 0x5e, 0x83, 0xf9, 0xed, 0x81, 0x93, 0x31, 0xb6, 0x8a, 0x62, 0x14, 0x5b, 0xe4, 0x30, 0xb3, + 0x9b, 0x2a, 0xca, 0xfc, 0x57, 0x01, 0x4a, 0xe2, 0xf0, 0xae, 0x40, 0xb5, 0x1d, 0xf0, 0x66, 0x90, + 0x7c, 0x56, 0xcf, 0xe1, 0x04, 0xc1, 0xb4, 0xe0, 0x9f, 0x49, 0x95, 0x5b, 0x82, 0xe8, 0x2d, 0x98, + 0x17, 0x9f, 0x51, 0x68, 0x9e, 0x2c, 0xf9, 0x66, 0x1d, 0x18, 0xab, 0x1c, 0xe8, 0x21, 0x5c, 0xd8, + 0x22, 0xc4, 0x69, 0xf9, 0x74, 0x34, 0x8a, 0x28, 0x64, 0x62, 0x7d, 0x8c, 0x98, 0x49, 0x3e, 0xf4, + 0x06, 0x9c, 0x67, 0xc8, 0x86, 0xe3, 0xc4, 0xa2, 0x44, 0x11, 0x0a, 0x4d, 0xc6, 0x56, 0x9c, 0x25, + 0x45, 0x4d, 0x58, 0x7c, 0x77, 0xe4, 0x58, 0x21, 0x91, 0x26, 0x8c, 0x52, 0xb4, 0xeb, 0x79, 0x49, + 0x83, 0xdc, 0x20, 0x9c, 0x61, 0xc9, 0xf6, 0x61, 0x2b, 0x13, 0x7d, 0x58, 0xf4, 0xff, 0xbc, 0xea, + 0xd6, 0x27, 0x3c, 0x75, 0x5e, 0xcc, 0xa4, 0x24, 0x51, 0x3f, 0xae, 0x2f, 0x2a, 0x6e, 0x7d, 0x82, + 0x76, 0xe1, 0x52, 0x8e, 0xe3, 0x04, 0xcb, 0x55, 0xae, 0xdb, 0xda, 0x71, 0x1e, 0x86, 0x73, 0xb9, + 0xcd, 0x1f, 0xc0, 0xa5, 0xf8, 0x86, 0x51, 0x5b, 0xcc, 0xa7, 0xb8, 0xd9, 0xd6, 0xa3, 0xea, 0xa1, + 0x3e, 0xf5, 0x7a, 0x90, 0x45, 0xc3, 0x9c, 0xa6, 0x9d, 0xf9, 0x77, 0x8d, 0x9d, 0xaa, 0xd4, 0x5f, + 0x14, 0x4e, 0x33, 0x79, 0xde, 0x75, 0xa8, 0xcf, 0xe2, 0x3a, 0xcc, 0x7b, 0xdc, 0xdf, 0x81, 0xcb, + 0x22, 0xe7, 0x0a, 0xdc, 0xe7, 0xa4, 0x37, 0x22, 0x7e, 
0x2f, 0x20, 0x36, 0xf5, 0xc4, 0x03, 0x50, + 0xc7, 0x88, 0x0f, 0x76, 0xdd, 0xe7, 0x64, 0x87, 0xf8, 0x5d, 0x3e, 0x92, 0xd7, 0xa2, 0x31, 0x7f, + 0xab, 0x01, 0x52, 0x7b, 0xb0, 0xb3, 0xb9, 0x08, 0xdf, 0x81, 0xda, 0x5e, 0x22, 0x34, 0xee, 0xad, + 0xbe, 0x90, 0x9f, 0x4d, 0xa8, 0xf3, 0xa7, 0xf9, 0x72, 0x77, 0xc9, 0x81, 0x05, 0x35, 0xfd, 0x63, + 0x34, 0xa1, 0x1b, 0x07, 0x60, 0xfe, 0xcd, 0x70, 0x1e, 0x75, 0xa2, 0x48, 0xcb, 0xbf, 0x19, 0xce, + 0x8e, 0x64, 0x55, 0x31, 0xff, 0x66, 0x41, 0x64, 0x28, 0xba, 0x7a, 0x32, 0x7c, 0x46, 0xa0, 0xf9, + 0x1a, 0x2c, 0x64, 0xdb, 0x0e, 0xfb, 0x6e, 0x7f, 0x5f, 0xfe, 0xc7, 0x81, 0x7f, 0x23, 0x03, 0x0a, + 0x03, 0x7a, 0x28, 0xc3, 0x0f, 0xfb, 0x64, 0xba, 0xa9, 0x66, 0x39, 0x19, 0x17, 0xd7, 0x36, 0x09, + 0xf6, 0xfc, 0x9b, 0x5d, 0x5a, 0xd1, 0x2b, 0x57, 0xaa, 0x16, 0xc3, 0xe6, 0xf7, 0xe0, 0xe6, 0x26, + 0xed, 0x2b, 0x65, 0xb7, 0xa4, 0x55, 0x39, 0x9b, 0x0d, 0x34, 0x3f, 0xd6, 0x60, 0x6d, 0xfa, 0x14, + 0xb3, 0xc9, 0x46, 0x8e, 0xeb, 0x83, 0x0e, 0x98, 0x2d, 0x89, 0x7d, 0x10, 0x8c, 0x87, 0x1d, 0x12, + 0x5a, 0xe8, 0x2b, 0xd1, 0xd9, 0xce, 0xcb, 0x2d, 0x22, 0xca, 0xd4, 0x19, 0xaf, 0x83, 0x61, 0xab, + 0xf8, 0x2e, 0xf9, 0x50, 0xce, 0x33, 0x81, 0x37, 0x7f, 0xa6, 0xc1, 0x65, 0xa5, 0xdb, 0x4f, 0xc2, + 0x48, 0x22, 0xba, 0x04, 0x25, 0xd1, 0x31, 0x11, 0x9b, 0x28, 0x00, 0xe6, 0x39, 0x4f, 0xa9, 0xff, + 0x80, 0x6d, 0xae, 0xbc, 0x7e, 0x24, 0x88, 0x96, 0xa0, 0xfc, 0x94, 0xfa, 0x9b, 0xf4, 0x50, 0x9e, + 0x5b, 0x09, 0x89, 0xec, 0x6b, 0xc8, 0x39, 0x8a, 0xb2, 0xb0, 0x23, 0x40, 0xc6, 0x11, 0x8c, 0x87, + 0x8c, 0x43, 0x24, 0xbe, 0x12, 0x62, 0xa9, 0xe0, 0x5a, 0xae, 0x4e, 0x0d, 0xfb, 0x60, 0x56, 0xbb, + 0x70, 0x09, 0x4a, 0x6a, 0x55, 0x58, 0x00, 0xb9, 0x7f, 0x69, 0x90, 0xff, 0x7c, 0x2a, 0xc6, 0xff, + 0x7c, 0x32, 0xff, 0xaa, 0x81, 0x99, 0xab, 0x9f, 0xb8, 0x7f, 0x66, 0x14, 0x4c, 0xce, 0xa0, 0x21, + 0x7a, 0x13, 0xe6, 0xa2, 0x9d, 0xe6, 0xb6, 0xcd, 0xfe, 0x6f, 0x26, 0x57, 0x7b, 0x1c, 0xf3, 0xd4, + 0x6f, 0x44, 0xc9, 0x13, 0xaa, 0x42, 0xe9, 0x91, 0xef, 0x86, 0xc4, 0x38, 0x87, 0xe6, 0xa0, 
0xb8, + 0x63, 0x05, 0x81, 0xa1, 0xd5, 0xd7, 0x45, 0x6e, 0xa4, 0x74, 0x7b, 0x01, 0xca, 0x4d, 0x9f, 0x58, + 0x9c, 0x0e, 0xa0, 0x2c, 0xca, 0xa0, 0x86, 0x56, 0xef, 0xc0, 0x82, 0xda, 0xe4, 0x65, 0xe2, 0xb6, + 0x7b, 0x0d, 0xc7, 0x31, 0xce, 0xa1, 0x05, 0x98, 0xdb, 0xee, 0x45, 0x84, 0x8c, 0x69, 0xbb, 0xd7, + 0x61, 0xdf, 0x3a, 0x9a, 0x87, 0xca, 0x76, 0x8f, 0x67, 0xa3, 0x46, 0x41, 0x00, 0xbc, 0xbd, 0x6b, + 0x14, 0xeb, 0xf7, 0x60, 0x41, 0xed, 0x29, 0x30, 0x71, 0x8d, 0xcd, 0xf6, 0x7b, 0x1b, 0x42, 0x5c, + 0x0b, 0x37, 0xda, 0x5b, 0xed, 0xad, 0x77, 0x0c, 0x8d, 0x41, 0xdd, 0xdd, 0xed, 0x9d, 0x1d, 0x06, + 0xe9, 0xf5, 0xd7, 0x01, 0x92, 0xcb, 0x9c, 0xad, 0x63, 0x6b, 0x7b, 0x8b, 0xf1, 0xcc, 0x43, 0xe5, + 0x51, 0xa3, 0xbd, 0x2b, 0x58, 0x18, 0x80, 0x05, 0xa0, 0x33, 0x9a, 0x16, 0xa3, 0x29, 0xd4, 0x5f, + 0xc9, 0xa4, 0xf8, 0xa8, 0x02, 0x85, 0xc6, 0x60, 0x60, 0x9c, 0x43, 0x65, 0xd0, 0x5b, 0xf7, 0x85, + 0xea, 0x5b, 0xd4, 0x1f, 0x5a, 0x03, 0x43, 0xaf, 0xbf, 0x03, 0x57, 0xa7, 0xa6, 0x96, 0x5c, 0xdb, + 0x56, 0xa7, 0xbd, 0x2b, 0x66, 0xc6, 0x1b, 0x9b, 0x1b, 0x8d, 0xee, 0x86, 0xa1, 0x21, 0x04, 0x8b, + 0x12, 0xe8, 0x75, 0x9b, 0x0f, 0x36, 0x3a, 0x0d, 0x43, 0xaf, 0x3f, 0x87, 0xc5, 0xf4, 0x7d, 0xc9, + 0xf5, 0xa3, 0xfe, 0x81, 0xeb, 0xf5, 0x05, 0x7f, 0x37, 0xe4, 0xe9, 0x96, 0xd0, 0x5c, 0xd8, 0xd1, + 0x31, 0x74, 0x64, 0xc0, 0x42, 0xdb, 0x73, 0x43, 0xd7, 0x1a, 0xb8, 0xcf, 0x19, 0x6d, 0x01, 0xd5, + 0xa0, 0xba, 0xe3, 0x93, 0x91, 0xe5, 0x33, 0xb0, 0x88, 0x16, 0x01, 0xb8, 0x39, 0x31, 0xb1, 0x9c, + 0x67, 0x46, 0x89, 0x31, 0x3c, 0xb2, 0xdc, 0xd0, 0xf5, 0xfa, 0xc2, 0xca, 0xe5, 0xfa, 0x37, 0xa1, + 0x96, 0x8a, 0x2b, 0xe8, 0x02, 0xd4, 0xde, 0xdd, 0x6a, 0x6f, 0xb5, 0x77, 0xdb, 0x8d, 0xcd, 0xf6, + 0xfb, 0x1b, 0x2d, 0x61, 0xee, 0x4e, 0xbb, 0xdb, 0x69, 0xec, 0x36, 0x1f, 0x18, 0x1a, 0x5b, 0x99, + 0xf8, 0xd4, 0xef, 0xbf, 0xf9, 0xc7, 0xcf, 0x57, 0xb5, 0x4f, 0x3e, 0x5f, 0xd5, 0x3e, 0xfb, 0x7c, + 0x55, 0xfb, 0xe9, 0x17, 0xab, 0xe7, 0x3e, 0xf9, 0x62, 0xf5, 0xdc, 0xa7, 0x5f, 0xac, 0x9e, 0x7b, + 0xff, 0xc5, 0xbe, 0x1b, 0xee, 
0x8f, 0xf7, 0x6e, 0xd9, 0x74, 0x78, 0x7b, 0xe4, 0x7a, 0x7d, 0xdb, + 0x1a, 0xdd, 0x0e, 0x5d, 0xdb, 0xb1, 0x6f, 0x2b, 0xae, 0xb9, 0x57, 0xe6, 0x5d, 0x8f, 0x57, 0xff, + 0x1d, 0x00, 0x00, 0xff, 0xff, 0xc0, 0xa9, 0xa2, 0xee, 0xc1, 0x2a, 0x00, 0x00, } func (m *TableSpan) Marshal() (dAtA []byte, err error) { @@ -4522,6 +4623,11 @@ func (m *HeartBeatResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x20 + } if m.Mode != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.Mode)) i-- @@ -4813,6 +4919,11 @@ func (m *ScheduleDispatcherRequest) MarshalToSizedBuffer(dAtA []byte) (int, erro _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x28 + } if m.OperatorType != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.OperatorType)) i-- @@ -4870,6 +4981,11 @@ func (m *MergeDispatcherRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x28 + } if m.Mode != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.Mode)) i-- @@ -5018,6 +5134,11 @@ func (m *MaintainerStatus) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x48 + } if m.DrainProgress != nil { { size, err := m.DrainProgress.MarshalToSizedBuffer(dAtA[:i]) @@ -5339,6 +5460,11 @@ func (m *AddMaintainerRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x30 + } if m.KeyspaceId != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.KeyspaceId)) i-- @@ -5401,6 +5527,11 @@ func (m *RemoveMaintainerRequest) 
MarshalToSizedBuffer(dAtA []byte) (int, error) _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x28 + } if m.KeyspaceId != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.KeyspaceId)) i-- @@ -5461,6 +5592,11 @@ func (m *MaintainerBootstrapRequest) MarshalToSizedBuffer(dAtA []byte) (int, err _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x40 + } if m.KeyspaceId != 0 { i = encodeVarintHeartbeat(dAtA, i, uint64(m.KeyspaceId)) i-- @@ -5547,6 +5683,11 @@ func (m *MaintainerBootstrapResponse) MarshalToSizedBuffer(dAtA []byte) (int, er _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x38 + } if len(m.Operators) > 0 { for iNdEx := len(m.Operators) - 1; iNdEx >= 0; iNdEx-- { { @@ -5632,6 +5773,11 @@ func (m *MaintainerPostBootstrapRequest) MarshalToSizedBuffer(dAtA []byte) (int, _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x28 + } if len(m.RedoSchemas) > 0 { for iNdEx := len(m.RedoSchemas) - 1; iNdEx >= 0; iNdEx-- { { @@ -5707,6 +5853,11 @@ func (m *MaintainerPostBootstrapResponse) MarshalToSizedBuffer(dAtA []byte) (int _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x20 + } if m.Err != nil { { size, err := m.Err.MarshalToSizedBuffer(dAtA[:i]) @@ -5929,6 +6080,11 @@ func (m *MaintainerCloseRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) _ = i var l int _ = l + if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x18 + } if m.Removed { i-- if m.Removed { @@ -5974,6 +6130,11 @@ func (m *MaintainerCloseResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) _ = i var l int _ = l + 
if m.MaintainerEpoch != 0 { + i = encodeVarintHeartbeat(dAtA, i, uint64(m.MaintainerEpoch)) + i-- + dAtA[i] = 0x18 + } if m.Success { i-- if m.Success { @@ -7074,6 +7235,9 @@ func (m *HeartBeatResponse) Size() (n int) { if m.Mode != 0 { n += 1 + sovHeartbeat(uint64(m.Mode)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7193,6 +7357,9 @@ func (m *ScheduleDispatcherRequest) Size() (n int) { if m.OperatorType != 0 { n += 1 + sovHeartbeat(uint64(m.OperatorType)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7219,6 +7386,9 @@ func (m *MergeDispatcherRequest) Size() (n int) { if m.Mode != 0 { n += 1 + sovHeartbeat(uint64(m.Mode)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7295,6 +7465,9 @@ func (m *MaintainerStatus) Size() (n int) { l = m.DrainProgress.Size() n += 1 + l + sovHeartbeat(uint64(l)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7426,6 +7599,9 @@ func (m *AddMaintainerRequest) Size() (n int) { if m.KeyspaceId != 0 { n += 1 + sovHeartbeat(uint64(m.KeyspaceId)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7448,6 +7624,9 @@ func (m *RemoveMaintainerRequest) Size() (n int) { if m.KeyspaceId != 0 { n += 1 + sovHeartbeat(uint64(m.KeyspaceId)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7482,6 +7661,9 @@ func (m *MaintainerBootstrapRequest) Size() (n int) { if m.KeyspaceId != 0 { n += 1 + sovHeartbeat(uint64(m.KeyspaceId)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7517,6 +7699,9 @@ func (m *MaintainerBootstrapResponse) Size() (n int) { n += 1 + l + sovHeartbeat(uint64(l)) } } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } 
@@ -7546,6 +7731,9 @@ func (m *MaintainerPostBootstrapRequest) Size() (n int) { n += 1 + l + sovHeartbeat(uint64(l)) } } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7567,6 +7755,9 @@ func (m *MaintainerPostBootstrapResponse) Size() (n int) { l = m.Err.Size() n += 1 + l + sovHeartbeat(uint64(l)) } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7654,6 +7845,9 @@ func (m *MaintainerCloseRequest) Size() (n int) { if m.Removed { n += 2 } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -7670,6 +7864,9 @@ func (m *MaintainerCloseResponse) Size() (n int) { if m.Success { n += 2 } + if m.MaintainerEpoch != 0 { + n += 1 + sovHeartbeat(uint64(m.MaintainerEpoch)) + } return n } @@ -9344,6 +9541,25 @@ func (m *HeartBeatResponse) Unmarshal(dAtA []byte) error { break } } + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -10142,6 +10358,25 @@ func (m *ScheduleDispatcherRequest) Unmarshal(dAtA []byte) error { break } } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -10317,6 +10552,25 @@ func (m 
*MergeDispatcherRequest) Unmarshal(dAtA []byte) error { break } } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -10805,6 +11059,25 @@ func (m *MaintainerStatus) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 9: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -11622,6 +11895,25 @@ func (m *AddMaintainerRequest) Unmarshal(dAtA []byte) error { break } } + case 6: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -11767,6 +12059,25 @@ func (m *RemoveMaintainerRequest) Unmarshal(dAtA []byte) error { break } } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 
ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -12017,6 +12328,25 @@ func (m *MaintainerBootstrapRequest) Unmarshal(dAtA []byte) error { break } } + case 8: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -12245,6 +12575,25 @@ func (m *MaintainerBootstrapResponse) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -12435,6 +12784,25 @@ func (m *MaintainerPostBootstrapRequest) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := 
skipHeartbeat(dAtA[iNdEx:]) @@ -12593,6 +12961,25 @@ func (m *MaintainerPostBootstrapResponse) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -13169,6 +13556,25 @@ func (m *MaintainerCloseRequest) Unmarshal(dAtA []byte) error { } } m.Removed = bool(v != 0) + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) @@ -13275,6 +13681,25 @@ func (m *MaintainerCloseResponse) Unmarshal(dAtA []byte) error { } } m.Success = bool(v != 0) + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaintainerEpoch", wireType) + } + m.MaintainerEpoch = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowHeartbeat + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaintainerEpoch |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipHeartbeat(dAtA[iNdEx:]) diff --git a/heartbeatpb/heartbeat.proto b/heartbeatpb/heartbeat.proto index 419c1bd7fe..afdd75f7cf 100644 --- a/heartbeatpb/heartbeat.proto +++ b/heartbeatpb/heartbeat.proto @@ -72,6 +72,8 @@ message 
HeartBeatResponse { ChangefeedID changefeedID = 1; repeated DispatcherStatus dispatcherStatuses = 2; int64 mode = 3; + // maintainer_epoch fences barrier decisions from stale maintainers. + uint64 maintainer_epoch = 4; } message CheckpointTsMessage { @@ -132,6 +134,7 @@ message ScheduleDispatcherRequest { DispatcherConfig config = 2; ScheduleAction scheduleAction = 3; OperatorType operatorType = 4; + uint64 maintainer_epoch = 5; } message MergeDispatcherRequest { @@ -139,6 +142,7 @@ message MergeDispatcherRequest { repeated DispatcherID dispatcherIDs = 2; // dispatcherIDs from the dispatchers that need to be merged DispatcherID mergedDispatcherID = 3; // the dispatcherID of the new dispatcher to be merged to. int64 mode = 4; + uint64 maintainer_epoch = 5; } message MaintainerHeartbeat { @@ -172,6 +176,7 @@ message MaintainerStatus { // drain_progress reports the active dispatcher drain target observed by this maintainer. // Nil means no active dispatcher drain target. DrainProgress drain_progress = 8; + uint64 maintainer_epoch = 9; } // NodeLiveness is node-reported liveness. @@ -243,6 +248,7 @@ message AddMaintainerRequest { uint64 checkpoint_ts = 3; bool is_new_changefeed = 4; // only true when the changefeed is new created or resumed with overwriteCheckpointTs uint32 keyspace_id = 5; + uint64 maintainer_epoch = 6; } message RemoveMaintainerRequest { @@ -250,6 +256,7 @@ message RemoveMaintainerRequest { bool cascade = 2; bool removed = 3; uint32 keyspace_id = 4; + uint64 maintainer_epoch = 5; } message MaintainerBootstrapRequest { @@ -260,6 +267,7 @@ message MaintainerBootstrapRequest { bool is_new_changefeed = 5; // only true when the changefeed is new created or resumed with overwriteCheckpointTs DispatcherID table_trigger_redo_dispatcher_id = 6; // only for redo uint32 keyspace_id = 7; + uint64 maintainer_epoch = 8; } message MaintainerBootstrapResponse { @@ -279,6 +287,7 @@ message MaintainerBootstrapResponse { // It will be used when redo enable. 
uint64 redo_checkpoint_ts = 5; repeated ScheduleDispatcherRequest operators = 6; + uint64 maintainer_epoch = 7; } message MaintainerPostBootstrapRequest { @@ -286,12 +295,14 @@ message MaintainerPostBootstrapRequest { DispatcherID table_trigger_event_dispatcher_id = 2; repeated SchemaInfo schemas = 3; repeated SchemaInfo redo_schemas = 4; + uint64 maintainer_epoch = 5; } message MaintainerPostBootstrapResponse { ChangefeedID changefeedID = 1; DispatcherID table_trigger_event_dispatcher_id = 2; RunningError err = 3; + uint64 maintainer_epoch = 4; } message SchemaInfo { @@ -326,11 +337,13 @@ message MaintainerCloseRequest { ChangefeedID changefeedID = 1; // true when remove changefeed, false when pause the changefeed. bool removed = 2; + uint64 maintainer_epoch = 3; } message MaintainerCloseResponse { ChangefeedID changefeedID = 1; bool success = 2; + uint64 maintainer_epoch = 3; } enum InfluenceType { diff --git a/pkg/common/format.go b/pkg/common/format.go index 4a9cdbb3d9..2baf2d3329 100644 --- a/pkg/common/format.go +++ b/pkg/common/format.go @@ -111,12 +111,13 @@ func FormatMaintainerStatus(s *heartbeatpb.MaintainerStatus) string { return "" } sb := strings.Builder{} - fmt.Fprintf(&sb, "changefeed: %s, feedState: %s, state: %s, checkpointTs: %d, bootstrapDone: %t, errs: [", + fmt.Fprintf(&sb, "changefeed: %s, feedState: %s, state: %s, checkpointTs: %d, bootstrapDone: %t, maintainerEpoch: %d, errs: [", s.ChangefeedID.GetName(), s.FeedState, s.State.String(), s.CheckpointTs, - s.BootstrapDone) + s.BootstrapDone, + s.MaintainerEpoch) for _, err := range s.Err { sb.WriteString(err.String()) } diff --git a/pkg/common/maintainer_epoch.go b/pkg/common/maintainer_epoch.go new file mode 100644 index 0000000000..856ee9c801 --- /dev/null +++ b/pkg/common/maintainer_epoch.go @@ -0,0 +1,23 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package common + +// MaintainerEpochMatches keeps rolling-upgrade compatibility while enforcing +// exact owner epochs after upgraded maintainers report them. Epoch 0 means +// either side predates the maintainer epoch field, so it stays accepted during +// mixed-version rollout. This compatibility gate is not intended to fence every +// mixed-version race, only stale non-zero epochs after rollout completes. +func MaintainerEpochMatches(reportedEpoch, currentEpoch uint64) bool { + return reportedEpoch == 0 || currentEpoch == 0 || reportedEpoch == currentEpoch +} diff --git a/pkg/pdutil/utils.go b/pkg/pdutil/utils.go index ce4e952fa5..42a43f4757 100644 --- a/pkg/pdutil/utils.go +++ b/pkg/pdutil/utils.go @@ -66,3 +66,14 @@ func GenerateChangefeedEpoch(ctx context.Context, pdClient pd.Client) uint64 { } return oracle.ComposeTS(phyTs, logical) } + +// AdvanceChangefeedEpoch returns max(candidate, current+1), or an error when current is already the maximum uint64 and cannot advance.
+func AdvanceChangefeedEpoch(candidate, current uint64) (uint64, error) { + if candidate > current { + return candidate, nil + } + if current == ^uint64(0) { + return 0, cerror.ErrSchedulerRequestFailed.GenWithStackByArgs("changefeed epoch overflow") + } + return current + 1, nil +} diff --git a/pkg/pdutil/utils_test.go b/pkg/pdutil/utils_test.go index 235466bd0d..addde9e6a6 100644 --- a/pkg/pdutil/utils_test.go +++ b/pkg/pdutil/utils_test.go @@ -45,3 +45,19 @@ func TestGetSourceID(t *testing.T) { return sourceID == 2 }, 5*time.Second, 100*time.Millisecond) } + +func TestAdvanceChangefeedEpoch(t *testing.T) { + t.Parallel() + + epoch, err := AdvanceChangefeedEpoch(10, 8) + require.NoError(t, err) + require.Equal(t, uint64(10), epoch) + + epoch, err = AdvanceChangefeedEpoch(10, 12) + require.NoError(t, err) + require.Equal(t, uint64(13), epoch) + + _, err = AdvanceChangefeedEpoch(10, ^uint64(0)) + require.Error(t, err) + require.ErrorContains(t, err, "changefeed epoch overflow") +} From 38669c88b7f810feaf7be94670cb03ae231d4132 Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Wed, 17 Jun 2026 22:23:41 +0800 Subject: [PATCH 2/5] maintainer,dispatcher: fence stale maintainer epochs --- .../dispatchermanager/dispatcher_manager.go | 25 ++- .../dispatcher_manager_helper.go | 25 ++- .../dispatcher_manager_info.go | 38 +++- .../dispatcher_manager_redo.go | 2 +- .../dispatcher_manager_test.go | 20 +- .../dispatchermanager/heartbeat_collector.go | 6 +- downstreamadapter/dispatchermanager/helper.go | 162 +++++++++----- .../dispatchermanager/helper_test.go | 161 +++++++++++++- .../dispatcher_orchestrator.go | 178 ++++++++++----- .../dispatcherorchestrator/helper.go | 39 +++- maintainer/barrier.go | 2 + maintainer/barrier_event.go | 6 +- maintainer/barrier_test.go | 2 +- maintainer/maintainer.go | 100 ++++++++- maintainer/maintainer_controller.go | 33 ++- maintainer/maintainer_controller_bootstrap.go | 11 +- maintainer/maintainer_controller_helper.go | 9 +- 
maintainer/maintainer_controller_test.go | 68 ++++-- maintainer/maintainer_manager_maintainers.go | 180 ++++++++++++--- maintainer/maintainer_manager_test.go | 208 ++++++++++++++++++ maintainer/maintainer_test.go | 79 ++++++- maintainer/operator/operator_add.go | 17 +- maintainer/operator/operator_add_test.go | 12 +- maintainer/operator/operator_controller.go | 44 +++- .../operator/operator_controller_test.go | 19 +- maintainer/operator/operator_merge.go | 12 +- maintainer/operator/operator_merge_test.go | 14 +- maintainer/operator/operator_move.go | 32 ++- maintainer/operator/operator_move_test.go | 15 +- maintainer/operator/operator_remove.go | 34 +-- maintainer/operator/operator_remove_test.go | 10 +- maintainer/operator/operator_split.go | 5 +- maintainer/operator/operator_split_test.go | 14 +- maintainer/replica/replication_span.go | 34 ++- maintainer/replica/replication_span_test.go | 20 +- maintainer/scheduler/balance.go | 11 +- maintainer/scheduler/balance_splits.go | 21 +- maintainer/scheduler/basic.go | 8 +- maintainer/scheduler/drain.go | 2 +- maintainer/scheduler/drain_test.go | 2 +- maintainer/span/span_controller_test.go | 2 +- 41 files changed, 1344 insertions(+), 338 deletions(-) diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager.go b/downstreamadapter/dispatchermanager/dispatcher_manager.go index fb9d85deb2..ebb06ab50f 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager.go @@ -99,6 +99,9 @@ type DispatcherManager struct { maintainerEpoch uint64 maintainerID node.ID } + // MaintainerFenceMu serializes maintainer owner/epoch changes with request + // fence checks and scheduler side effects. 
+ MaintainerFenceMu sync.Mutex pdClock pdutil.Clock @@ -112,15 +115,12 @@ type DispatcherManager struct { dispatcherMap *DispatcherMap[*dispatcher.EventDispatcher] // redoDispatcherMap restore all the redo dispatchers in the DispatcherManager, including table trigger redo dispatcher redoDispatcherMap *DispatcherMap[*dispatcher.RedoDispatcher] - // currentOperatorMap stores at most one in-flight scheduling request per dispatcherID (event and redo). + // currentOperatorMap stores one in-flight scheduling request per dispatcherID. // - // It is used for: - // - suppressing duplicate maintainer requests for the same dispatcher, - // - reporting unfinished requests during bootstrap so a new maintainer can restore operators, - // - cleaning up remove requests when a dispatcher is fully removed. - // - // Entries must be deleted on completion (create -> after creation; remove -> on cleanup), otherwise - // future maintainer requests for the same dispatcherID will be ignored. + // The value carries sender and maintainer epoch so bootstrap recovery can + // return only current-epoch operators, and precheck can replace stale entries. + // Entries must be deleted on completion, otherwise future requests for the + // same dispatcherID will be ignored. 
currentOperatorMap sync.Map // map[common.DispatcherID]SchedulerDispatcherRequest (in dispatcher manager, not heartbeatpb) // schemaIDToDispatchers is shared in the DispatcherManager, // it store all the infos about schemaID->Dispatchers @@ -208,6 +208,7 @@ func NewDispatcherManager( tableTriggerRedoDispatcherID *heartbeatpb.DispatcherID, startTs uint64, maintainerID node.ID, + maintainerEpoch uint64, newChangefeed bool, registerInitializing func(*DispatcherManager) bool, ) (manager *DispatcherManager, err error) { @@ -255,8 +256,10 @@ func NewDispatcherManager( metricRedoCreateDispatcherDuration: metrics.CreateDispatcherDuration.WithLabelValues(changefeedID.Keyspace(), changefeedID.Name(), "redoDispatcher"), } - // Set the epoch and maintainerID of the event dispatcher manager - manager.meta.maintainerEpoch = cfConfig.Epoch + // Trust only the explicit request maintainer epoch for receiver fencing. The + // config epoch may be newer than an old rolling-upgrade request and must not + // turn epoch 0 compatibility traffic into strict-mode traffic. 
+ manager.meta.maintainerEpoch = maintainerEpoch manager.meta.maintainerID = maintainerID cleanupManager := manager defer func() { @@ -427,7 +430,7 @@ func (e *DispatcherManager) NewTableTriggerEventDispatcher(id *heartbeatpb.Dispa infos := map[common.DispatcherID]dispatcherCreateInfo{} dispatcherID := common.NewDispatcherIDFromPB(id) infos[dispatcherID] = dispatcherCreateInfo{ - Id: dispatcherID, + ID: dispatcherID, TableSpan: common.KeyspaceDDLSpan(e.keyspaceID), StartTs: startTs, SchemaID: 0, diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_helper.go b/downstreamadapter/dispatchermanager/dispatcher_manager_helper.go index 64731249cd..31daf67bb8 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_helper.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager_helper.go @@ -82,7 +82,7 @@ func prepareCreateDispatcher[T dispatcher.Dispatcher](infos map[common.Dispatche schemaIds := make([]int64, 0, len(infos)) skipDMLAsStartTsList := make([]bool, 0, len(infos)) for _, info := range infos { - id := info.Id + id := info.ID if _, ok := dispatcherMap.Get(id); ok { continue } @@ -266,17 +266,7 @@ func removeDispatcher[T dispatcher.Dispatcher](e *DispatcherManager, } } - // Submit async remove task to thread pool - task := &RemoveDispatcherTask{ - manager: e, - dispatcherItem: dispatcherItem, - retryCount: 0, - } - scheduler := GetRemoveDispatcherTaskScheduler() - taskHandle := scheduler.Submit(task, time.Now()) - - // Save taskHandle for later cancellation - e.removeTaskHandles.Store(id, taskHandle) + e.submitRemoveDispatcherTask(dispatcherItem) dispatcherItem.SetTryRemoving() @@ -296,6 +286,17 @@ func removeDispatcher[T dispatcher.Dispatcher](e *DispatcherManager, } } +func (e *DispatcherManager) submitRemoveDispatcherTask(dispatcherItem dispatcher.Dispatcher) { + task := &RemoveDispatcherTask{ + manager: e, + dispatcherItem: dispatcherItem, + retryCount: 0, + } + scheduler := GetRemoveDispatcherTaskScheduler() + taskHandle := 
scheduler.Submit(task, time.Now()) + e.removeTaskHandles.Store(dispatcherItem.GetId(), taskHandle) +} + // closeAllDispatchers is called when the event dispatcher manager is closing func closeAllDispatchers[T dispatcher.Dispatcher](changefeedID common.ChangeFeedID, dispatcherMap *DispatcherMap[T], diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_info.go b/downstreamadapter/dispatchermanager/dispatcher_manager_info.go index 07a5f28910..1728b5fe30 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_info.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager_info.go @@ -22,10 +22,10 @@ import ( "github.com/pingcap/ticdc/pkg/node" ) -// event_dispatcher_mananger_info.go is used to store the basic info and function of the event dispatcher manager +// dispatcher_manager_info.go stores the basic info and functions of the dispatcher manager. type dispatcherCreateInfo struct { - Id common.DispatcherID + ID common.DispatcherID TableSpan *heartbeatpb.TableSpan StartTs uint64 SchemaID int64 @@ -52,10 +52,40 @@ func (e *DispatcherManager) GetMaintainerID() node.ID { return e.meta.maintainerID } -func (e *DispatcherManager) SetMaintainerID(maintainerID node.ID) { +// TryUpdateMaintainer records the active maintainer owner and epoch. +// Maintainer epoch 0 is accepted only while the manager is still in compatibility +// mode. Once a non-zero epoch is known, epoch 0 must never downgrade the receiver +// back to compatibility mode. 
+func (e *DispatcherManager) TryUpdateMaintainer(from node.ID, maintainerEpoch uint64) bool { e.meta.Lock() defer e.meta.Unlock() - e.meta.maintainerID = maintainerID + if maintainerEpoch == 0 { + if e.meta.maintainerEpoch != 0 { + return false + } + e.meta.maintainerID = from + return true + } + if e.meta.maintainerEpoch > maintainerEpoch { + return false + } + if e.meta.maintainerEpoch == maintainerEpoch && e.meta.maintainerID != "" && e.meta.maintainerID != from { + return false + } + e.meta.maintainerEpoch = maintainerEpoch + e.meta.maintainerID = from + return true +} + +// IsMaintainerRequestAllowed reports whether a request belongs to the current +// maintainer owner/epoch view known by this dispatcher manager. +func (e *DispatcherManager) IsMaintainerRequestAllowed(from node.ID, maintainerEpoch uint64) bool { + e.meta.Lock() + defer e.meta.Unlock() + if maintainerEpoch == 0 { + return e.meta.maintainerEpoch == 0 && (e.meta.maintainerID == "" || e.meta.maintainerID == from) + } + return e.meta.maintainerEpoch == maintainerEpoch && e.meta.maintainerID == from } func (e *DispatcherManager) GetMaintainerEpoch() uint64 { diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_redo.go b/downstreamadapter/dispatchermanager/dispatcher_manager_redo.go index 87d1ba15ad..7ae077afb2 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_redo.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager_redo.go @@ -125,7 +125,7 @@ func (e *DispatcherManager) NewTableTriggerRedoDispatcher(id *heartbeatpb.Dispat infos := map[common.DispatcherID]dispatcherCreateInfo{} dispatcherID := common.NewDispatcherIDFromPB(id) infos[dispatcherID] = dispatcherCreateInfo{ - Id: dispatcherID, + ID: dispatcherID, TableSpan: common.KeyspaceDDLSpan(e.keyspaceID), StartTs: startTs, SchemaID: 0, diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go index 9dbdd0a365..f46a746e52 
100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go @@ -24,7 +24,6 @@ import ( "github.com/pingcap/ticdc/downstreamadapter/eventcollector" "github.com/pingcap/ticdc/downstreamadapter/sink" "github.com/pingcap/ticdc/downstreamadapter/sink/mock" - "github.com/pingcap/ticdc/downstreamadapter/sink/mysql" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/logservice/schemastore" "github.com/pingcap/ticdc/pkg/common" @@ -37,7 +36,6 @@ import ( "github.com/pingcap/ticdc/pkg/node" "github.com/pingcap/ticdc/pkg/pdutil" "github.com/pingcap/ticdc/pkg/routing" - mysqlcfg "github.com/pingcap/ticdc/pkg/sink/mysql" "github.com/pingcap/ticdc/pkg/util" "github.com/pingcap/ticdc/utils/threadpool" "github.com/stretchr/testify/require" @@ -455,20 +453,9 @@ func TestMergeDispatcherInvalidIDs(t *testing.T) { func TestTryCloseRemovedRequestAfterClosedReturnsImmediatelyAndTriggersCleanup(t *testing.T) { changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - mysqlConfig := mysqlcfg.New() - mysqlConfig.EnableDDLTs = false - mysqlSink := mysql.NewMySQLSink( - context.Background(), - changefeedID, - mysqlConfig, - nil, - false, - false, - time.Minute, - ) manager := &DispatcherManager{ changefeedID: changefeedID, - sink: mysqlSink, + sink: newDispatcherManagerTestSink(t, common.BlackHoleSinkType), } manager.closed.Store(true) @@ -619,6 +606,7 @@ func TestNewDispatcherManagerReturnsFenceErrorWhenInitializingRegistrationReject nil, 1, node.ID("maintainer"), + 1, true, func(manager *DispatcherManager) bool { hookCalled.Store(true) @@ -750,7 +738,7 @@ func TestCreateDispatcherByInfoKeepsCreateOperatorWhenFenced(t *testing.T) { manager := createTestManager(t) manager.writePathClosed.Store(true) dispatcherID := common.NewDispatcherID() - createReq := NewSchedulerDispatcherRequest(&heartbeatpb.ScheduleDispatcherRequest{ + createReq := 
NewSchedulerDispatcherRequest(node.ID("maintainer"), &heartbeatpb.ScheduleDispatcherRequest{ ChangefeedID: manager.changefeedID.ToPB(), Config: &heartbeatpb.DispatcherConfig{ DispatcherID: dispatcherID.ToPB(), @@ -767,7 +755,7 @@ func TestCreateDispatcherByInfoKeepsCreateOperatorWhenFenced(t *testing.T) { createDispatcherByInfo(manager, map[common.DispatcherID]dispatcherCreateInfo{ dispatcherID: { - Id: dispatcherID, + ID: dispatcherID, TableSpan: &heartbeatpb.TableSpan{ TableID: 1, }, diff --git a/downstreamadapter/dispatchermanager/heartbeat_collector.go b/downstreamadapter/dispatchermanager/heartbeat_collector.go index 67417face8..3b58a50645 100644 --- a/downstreamadapter/dispatchermanager/heartbeat_collector.go +++ b/downstreamadapter/dispatchermanager/heartbeat_collector.go @@ -263,12 +263,12 @@ func (c *HeartBeatCollector) RecvMessages(_ context.Context, msg *messaging.Targ heartbeatResponse := msg.Message[0].(*heartbeatpb.HeartBeatResponse) c.heartBeatResponseDynamicStream.Push( common.NewChangefeedGIDFromPB(heartbeatResponse.ChangefeedID), - NewHeartBeatResponse(heartbeatResponse)) + NewHeartBeatResponse(msg.From, heartbeatResponse)) case messaging.TypeScheduleDispatcherRequest: schedulerDispatcherRequest := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) c.schedulerDispatcherRequestDynamicStream.Push( common.NewChangefeedGIDFromPB(schedulerDispatcherRequest.ChangefeedID), - NewSchedulerDispatcherRequest(schedulerDispatcherRequest)) + NewSchedulerDispatcherRequest(msg.From, schedulerDispatcherRequest)) // TODO: check metrics metrics.HandleDispatcherRequsetCounter.WithLabelValues("default", schedulerDispatcherRequest.ChangefeedID.Name, "receive").Inc() case messaging.TypeCheckpointTsMessage: @@ -290,7 +290,7 @@ func (c *HeartBeatCollector) RecvMessages(_ context.Context, msg *messaging.Targ mergeDispatcherRequest := msg.Message[0].(*heartbeatpb.MergeDispatcherRequest) c.mergeDispatcherRequestDynamicStream.Push( 
common.NewChangefeedGIDFromPB(mergeDispatcherRequest.ChangefeedID), - NewMergeDispatcherRequest(mergeDispatcherRequest)) + NewMergeDispatcherRequest(msg.From, mergeDispatcherRequest)) default: log.Warn("unknown message type, ignore it", zap.String("type", msg.Type.String()), diff --git a/downstreamadapter/dispatchermanager/helper.go b/downstreamadapter/dispatchermanager/helper.go index 52d7516dbd..843300998b 100644 --- a/downstreamadapter/dispatchermanager/helper.go +++ b/downstreamadapter/dispatchermanager/helper.go @@ -25,6 +25,7 @@ import ( "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/node" "github.com/pingcap/ticdc/pkg/util" "github.com/pingcap/ticdc/utils/dynstream" "go.uber.org/zap" @@ -41,22 +42,22 @@ type DispatcherMap[T dispatcher.Dispatcher] struct { // When some new dispatcher(table) is being added, the maintainer will block the forward of changefeed's checkpointTs // until the maintainer receive the message that the new dispatcher's component status change to working. // - // Besides, there is no strict order of the heartbeat message and the table status messages, which is means - // it can happen that when dispatcher A is created, event dispatcher manager may first send a table status message - // to show the new dispatcher is working, and then send a heartbeat message of the current watermark, - // which is calculated without the new disaptcher. - // When the checkpointTs of the watermark is large than the startTs of the new dispatcher, - // the watermark of next heartbeat, which calculated with the new dispatcher can be less than the previous watermark. - // Then it can cause the fallback of changefeed's checkpointTs. 
- // To avoid fallback, we add a seq number in each heartbeat message(both collect from collectComponentStatusWhenChanged and aggregateDispatcherHeartbeats) - // When a table is added the seq number will be increase, - // and when the maintainer receive the outdate seq, it will know the heartbeat message is outdate and ignore it. + // Besides, heartbeat messages and table status messages have no strict order. + // After dispatcher A is created, the event dispatcher manager may first send + // a table status message showing the new dispatcher is working, and then send + // a heartbeat with the current watermark calculated without the new dispatcher. + // If that watermark checkpointTs is larger than the new dispatcher's startTs, + // the next heartbeat calculated with the new dispatcher can be lower and cause + // the changefeed checkpointTs to fall back. + // To avoid fallback, each heartbeat carries a seq number collected from + // collectComponentStatusWhenChanged and aggregateDispatcherHeartbeats. When a + // table is added, the seq number increases, so the maintainer can ignore + // outdated heartbeat messages. // In this way, even the above case happens, the changefeed's checkpointTs will not fallback. // - // Here we don't need to make seq changes always atmoic with the m changed. - // Our target is just : - // The seq get from ForEach should be smaller than the seq get from Set - // when ForEach is not access the new dispatcher just Set. + // Here we don't need to make seq changes always atomic with the map changes. + // Our target is only that the seq from ForEach is smaller than the seq from + // Set when ForEach does not access the newly added dispatcher. // So we add seq after the dispatcher is add in the m for Set, and get the seq before do range for ForRange. 
seq atomic.Uint64 } @@ -178,11 +179,14 @@ func newSchedulerDispatcherRequestDynamicStream() dynstream.DynamicStream[int, c } type SchedulerDispatcherRequest struct { + From node.ID *heartbeatpb.ScheduleDispatcherRequest } -func NewSchedulerDispatcherRequest(req *heartbeatpb.ScheduleDispatcherRequest) SchedulerDispatcherRequest { - return SchedulerDispatcherRequest{req} +// NewSchedulerDispatcherRequest carries the sender node with the schedule +// request so dispatcher-manager admission can fence stale maintainers. +func NewSchedulerDispatcherRequest(from node.ID, req *heartbeatpb.ScheduleDispatcherRequest) SchedulerDispatcherRequest { + return SchedulerDispatcherRequest{From: from, ScheduleDispatcherRequest: req} } type SchedulerDispatcherRequestHandler struct{} @@ -204,10 +208,8 @@ func (h *SchedulerDispatcherRequestHandler) Path(scheduleDispatcherRequest Sched // Some requests are intentionally dropped (see preCheckForSchedulerHandler / handleScheduleRemove) to avoid // leaking operator entries in cases where we have no cleanup callback (e.g. remove a non-existent dispatcher). func (h *SchedulerDispatcherRequestHandler) Handle(dispatcherManager *DispatcherManager, reqs ...SchedulerDispatcherRequest) bool { - if len(reqs) == 0 { - // dynstream guarantees len(events)>0, but guard defensively to avoid panics if that contract changes. - return false - } + dispatcherManager.MaintainerFenceMu.Lock() + defer dispatcherManager.MaintainerFenceMu.Unlock() // `dynstream` guarantees per-path serialization: for a given changefeed (Path), // SchedulerDispatcherRequestHandler.Handle will not be executed concurrently. 
This matters for reasoning: @@ -219,14 +221,14 @@ func (h *SchedulerDispatcherRequestHandler) Handle(dispatcherManager *Dispatcher infos := map[common.DispatcherID]dispatcherCreateInfo{} redoInfos := map[common.DispatcherID]dispatcherCreateInfo{} for _, req := range reqs { - operatorKey, ok := preCheckForSchedulerHandler(req, dispatcherManager) + dispatcherID, ok := preCheckForSchedulerHandler(req, dispatcherManager) if !ok { continue } switch req.ScheduleAction { case heartbeatpb.ScheduleAction_Create: // Store the add operator and create an info for later create dispatcher. - handleScheduleCreate(dispatcherManager, req, operatorKey, infos, redoInfos) + handleScheduleCreate(dispatcherManager, req, dispatcherID, infos, redoInfos) case heartbeatpb.ScheduleAction_Remove: // Remove is non-batchable (see GetType), so reqs should contain exactly one request. if len(reqs) != 1 { @@ -234,7 +236,7 @@ func (h *SchedulerDispatcherRequestHandler) Handle(dispatcherManager *Dispatcher } // Store the remove operator (when applicable) and remove the dispatcher directly. // The remove operator will be deleted after the dispatcher is removed from dispatcherMap. - handleScheduleRemove(dispatcherManager, req, operatorKey) + handleScheduleRemove(dispatcherManager, req, dispatcherID) default: log.Panic("unknown schedule action", zap.Int("action", int(req.ScheduleAction))) } @@ -251,11 +253,10 @@ func (h *SchedulerDispatcherRequestHandler) Handle(dispatcherManager *Dispatcher // preCheckForSchedulerHandler validates a scheduling request and decides whether it should be applied. // -// It returns the stable key used in currentOperatorMap (dispatcherID), and a boolean indicating whether the -// request should proceed. The precheck filters out: +// It returns the dispatcherID used as currentOperatorMap key. 
The precheck filters out: // - invalid requests (nil request/config/dispatcherID), // - redo requests when redo is disabled, -// - duplicate Create requests for the same dispatcherID while another operator is in-flight, +// - stale maintainer requests and duplicate Create requests, // - Create requests for an already-existing dispatcher (idempotent no-op). // // Note: Remove requests are allowed to proceed even if the dispatcher doesn't exist (we still want to emit a @@ -270,32 +271,46 @@ func preCheckForSchedulerHandler(req SchedulerDispatcherRequest, dispatcherManag log.Warn("scheduleDispatcherRequest config is nil, skip") return common.DispatcherID{}, false } - operatorKey := common.NewDispatcherIDFromPB(req.Config.DispatcherID) - if operatorKey.IsZero() { + dispatcherID := common.NewDispatcherIDFromPB(req.Config.DispatcherID) + if dispatcherID.IsZero() { log.Warn("scheduleDispatcherRequest has no valid operator key, skip") return common.DispatcherID{}, false } - + if !dispatcherManager.IsMaintainerRequestAllowed(req.From, req.MaintainerEpoch) { + log.Warn("drop stale schedule dispatcher request", + zap.String("changefeedID", req.ChangefeedID.String()), + zap.String("dispatcherID", dispatcherID.String()), + zap.String("from", req.From.String()), + zap.Uint64("requestMaintainerEpoch", req.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", dispatcherManager.GetMaintainerEpoch()), + zap.String("currentMaintainer", dispatcherManager.GetMaintainerID().String())) + return common.DispatcherID{}, false + } isRedo := common.IsRedoMode(req.Config.Mode) if isRedo && !dispatcherManager.IsRedoReady() { return common.DispatcherID{}, false } - if _, operatorExists := dispatcherManager.currentOperatorMap.Load(operatorKey); operatorExists { - // Create requests must be serialized per dispatcherID; otherwise we can end up creating multiple - // dispatchers for the same span/dispatcherID. 
- if req.ScheduleAction == heartbeatpb.ScheduleAction_Create { - return common.DispatcherID{}, false + if existing, operatorExists := dispatcherManager.currentOperatorMap.Load(dispatcherID); operatorExists { + existingReq := existing.(SchedulerDispatcherRequest) + if !dispatcherManager.IsMaintainerRequestAllowed(existingReq.From, existingReq.MaintainerEpoch) { + dispatcherManager.currentOperatorMap.Delete(dispatcherID) + } else { + // Create requests must be serialized per dispatcherID; otherwise we can end up creating multiple + // dispatchers for the same span/dispatcherID. + if req.ScheduleAction == heartbeatpb.ScheduleAction_Create { + return common.DispatcherID{}, false + } + // Remove requests are allowed to proceed: removeDispatcher is idempotent and the incoming request + // may carry a newer OperatorType for maintainer bootstrap/failover reconstruction. } - // Remove requests are allowed to proceed: removeDispatcher is idempotent and the incoming request - // may carry a newer OperatorType for maintainer bootstrap/failover reconstruction. } // Check whether the dispatcher exists locally. This is used to treat Create as idempotent. 
var dispatcherExists bool if isRedo { - _, dispatcherExists = dispatcherManager.redoDispatcherMap.Get(operatorKey) + _, dispatcherExists = dispatcherManager.redoDispatcherMap.Get(dispatcherID) } else { - _, dispatcherExists = dispatcherManager.dispatcherMap.Get(operatorKey) + _, dispatcherExists = dispatcherManager.dispatcherMap.Get(dispatcherID) } // Action-aware precheck: @@ -309,27 +324,26 @@ func preCheckForSchedulerHandler(req SchedulerDispatcherRequest, dispatcherManag case heartbeatpb.ScheduleAction_Remove: } - return operatorKey, true + return dispatcherID, true } func handleScheduleCreate( dispatcherManager *DispatcherManager, req SchedulerDispatcherRequest, - operatorKey common.DispatcherID, + dispatcherID common.DispatcherID, infos map[common.DispatcherID]dispatcherCreateInfo, redoInfos map[common.DispatcherID]dispatcherCreateInfo, ) { config := req.Config - dispatcherID := common.NewDispatcherIDFromPB(config.DispatcherID) info := dispatcherCreateInfo{ - Id: dispatcherID, + ID: dispatcherID, TableSpan: config.Span, StartTs: config.StartTs, SchemaID: config.SchemaID, SkipDMLAsStartTs: config.SkipDMLAsStartTs, } if common.IsRedoMode(config.Mode) { - dispatcherManager.currentOperatorMap.Store(operatorKey, req) + dispatcherManager.currentOperatorMap.Store(dispatcherID, req) log.Debug("store current working add operator for redo dispatcher", zap.String("changefeedID", req.ChangefeedID.String()), zap.String("dispatcherID", dispatcherID.String()), @@ -337,7 +351,7 @@ func handleScheduleCreate( ) redoInfos[dispatcherID] = info } else { - dispatcherManager.currentOperatorMap.Store(operatorKey, req) + dispatcherManager.currentOperatorMap.Store(dispatcherID, req) log.Debug("store current working add operator", zap.String("changefeedID", req.ChangefeedID.String()), zap.String("dispatcherID", dispatcherID.String()), @@ -350,10 +364,9 @@ func handleScheduleCreate( func handleScheduleRemove( dispatcherManager *DispatcherManager, req SchedulerDispatcherRequest, - 
operatorKey common.DispatcherID, + dispatcherID common.DispatcherID, ) { config := req.Config - dispatcherID := common.NewDispatcherIDFromPB(config.DispatcherID) if common.IsRedoMode(config.Mode) { // If redo is disabled or the dispatcher does not exist, do not store the remove operator. // Otherwise, the operator may never be cleaned up because cleanRedoDispatcher won't be called. @@ -361,7 +374,7 @@ func handleScheduleRemove( return } if _, exists := dispatcherManager.redoDispatcherMap.Get(dispatcherID); exists { - dispatcherManager.currentOperatorMap.Store(operatorKey, req) + dispatcherManager.currentOperatorMap.Store(dispatcherID, req) log.Debug("store current working remove operator for redo dispatcher", zap.String("changefeedID", req.ChangefeedID.String()), zap.String("dispatcherID", dispatcherID.String()), @@ -381,7 +394,7 @@ func handleScheduleRemove( // If the dispatcher does not exist, do not store the remove operator. // Otherwise, the operator may never be cleaned up because cleanEventDispatcher won't be called. if _, exists := dispatcherManager.dispatcherMap.Get(dispatcherID); exists { - dispatcherManager.currentOperatorMap.Store(operatorKey, req) + dispatcherManager.currentOperatorMap.Store(dispatcherID, req) log.Debug("store current working remove operator", zap.String("changefeedID", req.ChangefeedID.String()), zap.String("dispatcherID", dispatcherID.String()), @@ -444,21 +457,21 @@ func deleteCreatedOperators[T dispatcher.Dispatcher]( dispatcherKind string, ) { for _, info := range infos { - if _, exists := dispatcherMap.Get(info.Id); !exists { + if _, exists := dispatcherMap.Get(info.ID); !exists { continue } // Create requests are stored in currentOperatorMap before creation and // should be deleted only after the dispatcher is actually created. 
- if v, ok := dispatcherManager.currentOperatorMap.Load(info.Id); ok { + if v, ok := dispatcherManager.currentOperatorMap.Load(info.ID); ok { req := v.(SchedulerDispatcherRequest) if req.ScheduleAction == heartbeatpb.ScheduleAction_Create { log.Debug("delete current working add operator", zap.String("changefeedID", dispatcherManager.changefeedID.String()), - zap.String("dispatcherID", info.Id.String()), + zap.String("dispatcherID", info.ID.String()), zap.String("dispatcherKind", dispatcherKind), zap.Any("operator", req), ) - dispatcherManager.currentOperatorMap.Delete(info.Id) + dispatcherManager.currentOperatorMap.Delete(info.ID) } } } @@ -506,11 +519,14 @@ func newHeartBeatResponseDynamicStream(dds dynstream.DynamicStream[common.GID, c } type HeartBeatResponse struct { + From node.ID *heartbeatpb.HeartBeatResponse } -func NewHeartBeatResponse(resp *heartbeatpb.HeartBeatResponse) HeartBeatResponse { - return HeartBeatResponse{resp} +// NewHeartBeatResponse carries the sender node with the heartbeat so stale +// maintainer responses cannot update dispatcher state. 
+func NewHeartBeatResponse(from node.ID, resp *heartbeatpb.HeartBeatResponse) HeartBeatResponse { + return HeartBeatResponse{From: from, HeartBeatResponse: resp} } type HeartBeatResponseHandler struct { @@ -531,6 +547,11 @@ func (h *HeartBeatResponseHandler) Handle(dispatcherManager *DispatcherManager, panic("invalid response count") } heartbeatResponse := resps[0] + dispatcherManager.MaintainerFenceMu.Lock() + defer dispatcherManager.MaintainerFenceMu.Unlock() + if !isHeartBeatResponseAllowed(dispatcherManager, heartbeatResponse) { + return false + } dispatcherStatuses := heartbeatResponse.GetDispatcherStatuses() for _, dispatcherStatus := range dispatcherStatuses { influencedDispatchersType := dispatcherStatus.InfluencedDispatchers.InfluenceType @@ -568,6 +589,21 @@ func (h *HeartBeatResponseHandler) Handle(dispatcherManager *DispatcherManager, return false } +// isHeartBeatResponseAllowed drops dispatcher heartbeats from stale maintainers +// before they can update table state or complete scheduler operators. 
+func isHeartBeatResponseAllowed(dispatcherManager *DispatcherManager, heartbeatResponse HeartBeatResponse) bool { + if dispatcherManager.IsMaintainerRequestAllowed(heartbeatResponse.From, heartbeatResponse.MaintainerEpoch) { + return true + } + log.Warn("drop stale heartbeat response", + zap.String("changefeedID", heartbeatResponse.ChangefeedID.String()), + zap.String("from", heartbeatResponse.From.String()), + zap.Uint64("responseMaintainerEpoch", heartbeatResponse.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", dispatcherManager.GetMaintainerEpoch()), + zap.String("currentMaintainer", dispatcherManager.GetMaintainerID().String())) + return false +} + func (h *HeartBeatResponseHandler) GetSize(event HeartBeatResponse) int { return 0 } func (h *HeartBeatResponseHandler) IsPaused(event HeartBeatResponse) bool { return false } func (h *HeartBeatResponseHandler) GetArea(_ common.GID, _ *DispatcherManager) int { @@ -783,11 +819,14 @@ func newMergeDispatcherRequestDynamicStream() dynstream.DynamicStream[int, commo } type MergeDispatcherRequest struct { + From node.ID *heartbeatpb.MergeDispatcherRequest } -func NewMergeDispatcherRequest(req *heartbeatpb.MergeDispatcherRequest) MergeDispatcherRequest { - return MergeDispatcherRequest{req} +// NewMergeDispatcherRequest carries the sender node together with the request +// so the handler can apply the dispatcher-manager maintainer fence. 
+func NewMergeDispatcherRequest(from node.ID, req *heartbeatpb.MergeDispatcherRequest) MergeDispatcherRequest { + return MergeDispatcherRequest{From: from, MergeDispatcherRequest: req} } type MergeDispatcherRequestHandler struct{} @@ -802,6 +841,17 @@ func (h *MergeDispatcherRequestHandler) Handle(dispatcherManager *DispatcherMana } mergeDispatcherRequest := reqs[0] + dispatcherManager.MaintainerFenceMu.Lock() + defer dispatcherManager.MaintainerFenceMu.Unlock() + if !dispatcherManager.IsMaintainerRequestAllowed(mergeDispatcherRequest.From, mergeDispatcherRequest.MaintainerEpoch) { + log.Warn("drop stale merge dispatcher request", + zap.String("changefeedID", mergeDispatcherRequest.ChangefeedID.String()), + zap.String("from", mergeDispatcherRequest.From.String()), + zap.Uint64("requestMaintainerEpoch", mergeDispatcherRequest.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", dispatcherManager.GetMaintainerEpoch()), + zap.String("currentMaintainer", dispatcherManager.GetMaintainerID().String())) + return false + } dispatcherIDs := make([]common.DispatcherID, 0, len(mergeDispatcherRequest.DispatcherIDs)) for _, id := range mergeDispatcherRequest.DispatcherIDs { dispatcherIDs = append(dispatcherIDs, common.NewDispatcherIDFromPB(id)) diff --git a/downstreamadapter/dispatchermanager/helper_test.go b/downstreamadapter/dispatchermanager/helper_test.go index 15cb2ea0a6..e23f1f81a9 100644 --- a/downstreamadapter/dispatchermanager/helper_test.go +++ b/downstreamadapter/dispatchermanager/helper_test.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/ticdc/downstreamadapter/sink/redo" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/pkg/common" + "github.com/pingcap/ticdc/pkg/node" "github.com/stretchr/testify/require" ) @@ -160,7 +161,7 @@ func TestPreCheckForSchedulerHandler_RemoveAllowedWhenDispatcherMissing(t *testi dispatcherMap: newDispatcherMap[*dispatcher.EventDispatcher](), } - removeReq := 
NewSchedulerDispatcherRequest(&heartbeatpb.ScheduleDispatcherRequest{ + removeReq := NewSchedulerDispatcherRequest("node1", &heartbeatpb.ScheduleDispatcherRequest{ ChangefeedID: &heartbeatpb.ChangefeedID{Keyspace: "test-namespace", Name: "test-changefeed"}, Config: &heartbeatpb.DispatcherConfig{ DispatcherID: dispatcherID.ToPB(), @@ -190,7 +191,7 @@ func TestPreCheckForSchedulerHandler_CreateSkippedWhenDispatcherExists(t *testin } dm.dispatcherMap.Set(dispatcherID, &dispatcher.EventDispatcher{}) - createReq := NewSchedulerDispatcherRequest(&heartbeatpb.ScheduleDispatcherRequest{ + createReq := NewSchedulerDispatcherRequest("node1", &heartbeatpb.ScheduleDispatcherRequest{ ChangefeedID: &heartbeatpb.ChangefeedID{Keyspace: "test-namespace", Name: "test-changefeed"}, Config: &heartbeatpb.DispatcherConfig{ DispatcherID: dispatcherID.ToPB(), @@ -204,6 +205,162 @@ func TestPreCheckForSchedulerHandler_CreateSkippedWhenDispatcherExists(t *testin require.False(t, ok) } +func TestPreCheckForSchedulerHandler_MaintainerEpochFence(t *testing.T) { + t.Parallel() + + dispatcherID := common.NewDispatcherID() + currentDM := &DispatcherManager{ + changefeedID: common.NewChangeFeedIDWithName("test-changefeed", "test-namespace"), + dispatcherMap: newDispatcherMap[*dispatcher.EventDispatcher](), + } + currentDM.meta.maintainerID = "current-maintainer" + currentDM.meta.maintainerEpoch = 2 + + newReq := func(epoch uint64) *heartbeatpb.ScheduleDispatcherRequest { + return &heartbeatpb.ScheduleDispatcherRequest{ + ChangefeedID: &heartbeatpb.ChangefeedID{Keyspace: "test-namespace", Name: "test-changefeed"}, + Config: &heartbeatpb.DispatcherConfig{ + DispatcherID: dispatcherID.ToPB(), + Mode: 0, + }, + ScheduleAction: heartbeatpb.ScheduleAction_Create, + OperatorType: heartbeatpb.OperatorType_O_Add, + MaintainerEpoch: epoch, + } + } + + _, ok := preCheckForSchedulerHandler(NewSchedulerDispatcherRequest("old-maintainer", newReq(1)), currentDM) + require.False(t, ok) + + operatorKey, ok := 
preCheckForSchedulerHandler(NewSchedulerDispatcherRequest("current-maintainer", newReq(2)), currentDM) + require.True(t, ok) + require.Equal(t, dispatcherID, operatorKey) + + _, ok = preCheckForSchedulerHandler(NewSchedulerDispatcherRequest("current-maintainer", newReq(0)), currentDM) + require.False(t, ok) + + compatDM := &DispatcherManager{ + changefeedID: currentDM.changefeedID, + dispatcherMap: newDispatcherMap[*dispatcher.EventDispatcher](), + } + compatDM.meta.maintainerID = "current-maintainer" + operatorKey, ok = preCheckForSchedulerHandler(NewSchedulerDispatcherRequest("current-maintainer", newReq(0)), compatDM) + require.True(t, ok) + require.Equal(t, dispatcherID, operatorKey) + + currentDM.currentOperatorMap.Store(dispatcherID, NewSchedulerDispatcherRequest("old-maintainer", newReq(1))) + operatorKey, ok = preCheckForSchedulerHandler(NewSchedulerDispatcherRequest("current-maintainer", newReq(2)), currentDM) + require.True(t, ok) + require.Equal(t, dispatcherID, operatorKey) + _, exists := currentDM.currentOperatorMap.Load(dispatcherID) + require.False(t, exists) +} + +func TestDispatcherManagerTryUpdateMaintainerEpoch(t *testing.T) { + t.Parallel() + + strictDM := &DispatcherManager{} + strictDM.meta.maintainerID = "current-maintainer" + strictDM.meta.maintainerEpoch = 2 + + require.False(t, strictDM.TryUpdateMaintainer("current-maintainer", 0)) + require.Equal(t, node.ID("current-maintainer"), strictDM.GetMaintainerID()) + require.Equal(t, uint64(2), strictDM.GetMaintainerEpoch()) + + compatDM := &DispatcherManager{} + compatDM.meta.maintainerID = "old-maintainer" + + require.True(t, compatDM.TryUpdateMaintainer("new-maintainer", 0)) + require.Equal(t, node.ID("new-maintainer"), compatDM.GetMaintainerID()) + require.Zero(t, compatDM.GetMaintainerEpoch()) +} + +func TestHeartBeatResponseAllowedByMaintainerEpoch(t *testing.T) { + t.Parallel() + + changefeedID := common.NewChangeFeedIDWithName("test-changefeed", "test-namespace") + strictDM := 
&DispatcherManager{ + changefeedID: changefeedID, + } + strictDM.meta.maintainerID = "current-maintainer" + strictDM.meta.maintainerEpoch = 2 + + require.False(t, isHeartBeatResponseAllowed(strictDM, NewHeartBeatResponse( + node.ID("old-maintainer"), + &heartbeatpb.HeartBeatResponse{ + ChangefeedID: changefeedID.ToPB(), + MaintainerEpoch: 1, + DispatcherStatuses: []*heartbeatpb.DispatcherStatus{ + { + InfluencedDispatchers: &heartbeatpb.InfluencedDispatchers{ + InfluenceType: heartbeatpb.InfluenceType_All, + }, + Action: &heartbeatpb.DispatcherAction{Action: heartbeatpb.Action_Pass}, + }, + }, + }, + ))) + require.False(t, isHeartBeatResponseAllowed(strictDM, NewHeartBeatResponse( + node.ID("current-maintainer"), + &heartbeatpb.HeartBeatResponse{ + ChangefeedID: changefeedID.ToPB(), + MaintainerEpoch: 0, + }, + ))) + require.True(t, isHeartBeatResponseAllowed(strictDM, NewHeartBeatResponse( + node.ID("current-maintainer"), + &heartbeatpb.HeartBeatResponse{ + ChangefeedID: changefeedID.ToPB(), + MaintainerEpoch: 2, + }, + ))) + + compatDM := &DispatcherManager{ + changefeedID: changefeedID, + } + compatDM.meta.maintainerID = "compat-maintainer" + require.True(t, isHeartBeatResponseAllowed(compatDM, NewHeartBeatResponse( + node.ID("compat-maintainer"), + &heartbeatpb.HeartBeatResponse{ + ChangefeedID: changefeedID.ToPB(), + MaintainerEpoch: 0, + }, + ))) +} + +func TestHeartBeatResponseHandlerDropsStaleMaintainerEpoch(t *testing.T) { + t.Parallel() + + changefeedID := common.NewChangeFeedIDWithName("test-changefeed", "test-namespace") + dispatcherID := common.NewDispatcherID() + dispatcherManager := &DispatcherManager{ + changefeedID: changefeedID, + } + dispatcherManager.meta.maintainerID = "current-maintainer" + dispatcherManager.meta.maintainerEpoch = 2 + + handler := &HeartBeatResponseHandler{} + staleResponse := NewHeartBeatResponse( + node.ID("old-maintainer"), + &heartbeatpb.HeartBeatResponse{ + ChangefeedID: changefeedID.ToPB(), + MaintainerEpoch: 1, + 
DispatcherStatuses: []*heartbeatpb.DispatcherStatus{ + { + InfluencedDispatchers: &heartbeatpb.InfluencedDispatchers{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + DispatcherIDs: []*heartbeatpb.DispatcherID{dispatcherID.ToPB()}, + }, + Action: &heartbeatpb.DispatcherAction{Action: heartbeatpb.Action_Pass}, + }, + }, + }) + + require.NotPanics(t, func() { + require.False(t, handler.Handle(dispatcherManager, staleResponse)) + }) +} + func TestDispatcherManagerIsRedoReadyRequiresPublication(t *testing.T) { t.Parallel() diff --git a/downstreamadapter/dispatcherorchestrator/dispatcher_orchestrator.go b/downstreamadapter/dispatcherorchestrator/dispatcher_orchestrator.go index 04c41524ad..bd5afde268 100644 --- a/downstreamadapter/dispatcherorchestrator/dispatcher_orchestrator.go +++ b/downstreamadapter/dispatcherorchestrator/dispatcher_orchestrator.go @@ -20,6 +20,7 @@ import ( "sync/atomic" "time" + "github.com/gogo/protobuf/proto" "github.com/pingcap/log" "github.com/pingcap/ticdc/downstreamadapter/dispatcher" "github.com/pingcap/ticdc/downstreamadapter/dispatchermanager" @@ -39,11 +40,15 @@ import ( // for different change feeds based on maintainer bootstrap messages. type DispatcherOrchestrator struct { mc messaging.MessageCenter - mutex sync.Mutex // protect dispatcherManagers + mutex sync.Mutex // protect dispatcherManagers and closedMaintainerEpochs dispatcherManagers map[common.ChangeFeedID]*dispatchermanager.DispatcherManager // initializingDispatcherManagers tracks managers that have been allocated // but are not yet visible in dispatcherManagers. initializingDispatcherManagers map[common.ChangeFeedID]*dispatchermanager.DispatcherManager + // closedMaintainerEpochs remembers the highest epoch that closed a manager. + // Map presence is meaningful because epoch 0 is a valid compatibility epoch. + // The tombstone prevents a delayed old bootstrap from recreating the manager after close. 
+ closedMaintainerEpochs map[common.ChangeFeedID]uint64 // shards partition changefeed control messages by changefeed ID. Each shard keeps // the existing FIFO queue semantics, while different shards can process messages @@ -71,6 +76,7 @@ func New() *DispatcherOrchestrator { mc: appcontext.GetService[messaging.MessageCenter](appcontext.MessageCenter), dispatcherManagers: make(map[common.ChangeFeedID]*dispatchermanager.DispatcherManager), initializingDispatcherManagers: make(map[common.ChangeFeedID]*dispatchermanager.DispatcherManager), + closedMaintainerEpochs: make(map[common.ChangeFeedID]uint64), shards: make([]*orchestratorShard, dispatcherOrchestratorShardCount), } for i := range m.shards { @@ -181,22 +187,31 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( return nil } cfId := common.NewChangefeedIDFromPB(req.ChangefeedID) - - cfConfig := &config.ChangefeedConfig{} - if err := json.Unmarshal(req.Config, cfConfig); err != nil { - log.Panic("failed to unmarshal changefeed config", - zap.String("changefeedID", cfId.Name()), zap.Any("data", req.Config), zap.Error(err)) - } + maintainerEpoch := req.MaintainerEpoch // Keep the map lock scoped to dispatcherManagers lookups and updates only. // NewDispatcherManager may perform expensive downstream initialization, so it // must run outside the mutex to let unrelated shards progress concurrently. 
m.mutex.Lock() manager, exists := m.dispatcherManagers[cfId] + closedEpoch, closed := m.closedMaintainerEpochs[cfId] m.mutex.Unlock() var err error if !exists { + if closed && (maintainerEpoch == 0 || maintainerEpoch <= closedEpoch) { + log.Warn("drop stale maintainer bootstrap request after close", + zap.String("changefeed", cfId.Name()), + zap.String("from", from.String()), + zap.Uint64("requestMaintainerEpoch", maintainerEpoch), + zap.Uint64("closedMaintainerEpoch", closedEpoch)) + return nil + } + cfConfig := &config.ChangefeedConfig{} + if err := json.Unmarshal(req.Config, cfConfig); err != nil { + log.Panic("failed to unmarshal changefeed config", + zap.String("changefeedID", cfId.Name()), zap.Any("data", req.Config), zap.Error(err)) + } start := time.Now() var initializingManager *dispatchermanager.DispatcherManager manager, err = dispatchermanager.NewDispatcherManager( @@ -207,6 +222,7 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( req.TableTriggerRedoDispatcherId, req.StartTs, from, + maintainerEpoch, req.IsNewChangefeed, func(manager *dispatchermanager.DispatcherManager) bool { initializingManager = manager @@ -229,7 +245,8 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( appcontext.GetService[*dispatchermanager.HeartBeatCollector](appcontext.HeartbeatCollector).RemoveDispatcherManager(cfId) response := &heartbeatpb.MaintainerBootstrapResponse{ - ChangefeedID: req.ChangefeedID, + ChangefeedID: req.ChangefeedID, + MaintainerEpoch: maintainerEpoch, Err: &heartbeatpb.RunningError{ Time: time.Now().String(), Node: from.String(), @@ -249,12 +266,23 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( return nil } m.dispatcherManagers[cfId] = manager + delete(m.closedMaintainerEpochs, cfId) m.mutex.Unlock() metrics.DispatcherManagerGauge.WithLabelValues(cfId.Keyspace(), cfId.Name()).Inc() - } else { - if m.fenced.Load() { - return nil - } + } + + manager.MaintainerFenceMu.Lock() + if !manager.TryUpdateMaintainer(from, 
maintainerEpoch) { + log.Warn("drop stale maintainer bootstrap request", + zap.String("changefeed", cfId.Name()), + zap.String("from", from.String()), + zap.Uint64("requestMaintainerEpoch", maintainerEpoch), + zap.Uint64("currentMaintainerEpoch", manager.GetMaintainerEpoch()), + zap.String("currentMaintainer", manager.GetMaintainerID().String())) + manager.MaintainerFenceMu.Unlock() + return nil + } + if exists { // Check and potentially add a table trigger event dispatcher. // This is necessary during maintainer node migration, as the existing // dispatcher manager on the new node may not have a table trigger @@ -275,7 +303,8 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( } log.Error("failed to create new table trigger event dispatcher", zap.Stringer("changefeedID", cfId), zap.Error(err)) - return m.handleDispatcherError(from, req.ChangefeedID, err) + manager.MaintainerFenceMu.Unlock() + return m.handleDispatcherError(from, req.ChangefeedID, maintainerEpoch, err) } } } @@ -295,26 +324,15 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( } log.Error("failed to create new table trigger redo dispatcher", zap.Stringer("changefeedID", cfId), zap.Error(err)) - return m.handleDispatcherError(from, req.ChangefeedID, err) + manager.MaintainerFenceMu.Unlock() + return m.handleDispatcherError(from, req.ChangefeedID, maintainerEpoch, err) } } } } - if manager.GetMaintainerID() != from { - manager.SetMaintainerID(from) - log.Info("maintainer changed", - zap.String("changefeed", cfId.Name()), zap.String("maintainer", from.String())) - } - - // FIXME(fizz): This is a temporary check to ensure the maintainer epoch is consistent. - // I will remove this after fully testing the new maintainer epoch mechanism. 
- if manager.GetMaintainerEpoch() != cfConfig.Epoch { - log.Error("maintainer epoch changed, this should not happen, please report this issue", - zap.String("changefeed", cfId.Name()), zap.Uint64("epoch", cfConfig.Epoch)) - } - if m.fenced.Load() { + manager.MaintainerFenceMu.Unlock() manager.LocalFence() return nil } @@ -332,6 +350,7 @@ func (m *DispatcherOrchestrator) handleBootstrapRequest( } } response := createBootstrapResponse(req.ChangefeedID, manager, startTs, redoStartTs) + manager.MaintainerFenceMu.Unlock() return m.sendResponse(from, messaging.MaintainerManagerTopic, response) } @@ -357,6 +376,17 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( zap.Any("changefeedID", cfId.Name())) return nil } + manager.MaintainerFenceMu.Lock() + if !manager.IsMaintainerRequestAllowed(from, req.MaintainerEpoch) { + log.Warn("drop stale maintainer post bootstrap request", + zap.String("changefeed", cfId.Name()), + zap.String("from", from.String()), + zap.Uint64("requestMaintainerEpoch", req.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", manager.GetMaintainerEpoch()), + zap.String("currentMaintainer", manager.GetMaintainerID().String())) + manager.MaintainerFenceMu.Unlock() + return nil + } if manager.GetTableTriggerEventDispatcher().GetId() != common.NewDispatcherIDFromPB(req.TableTriggerEventDispatcherId) { log.Error("Receive post bootstrap request but the table trigger event dispatcher id is not match", @@ -370,7 +400,8 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( GenWithStackByArgs("Receive post bootstrap request but the table trigger event dispatcher id is not match") response := &heartbeatpb.MaintainerPostBootstrapResponse{ - ChangefeedID: req.ChangefeedID, + ChangefeedID: req.ChangefeedID, + MaintainerEpoch: req.MaintainerEpoch, Err: &heartbeatpb.RunningError{ Time: time.Now().String(), Node: from.String(), @@ -379,6 +410,7 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( }, } + 
manager.MaintainerFenceMu.Unlock() return m.sendResponse(from, messaging.MaintainerManagerTopic, response) } @@ -392,7 +424,8 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( } log.Error("failed to initialize table trigger event dispatcher", zap.Any("changefeedID", cfId.Name()), zap.Error(err)) - return m.handleDispatcherError(from, req.ChangefeedID, err) + manager.MaintainerFenceMu.Unlock() + return m.handleDispatcherError(from, req.ChangefeedID, req.MaintainerEpoch, err) } if manager.IsRedoReady() { err := manager.InitalizeTableTriggerRedoDispatcher(req.RedoSchemas) @@ -404,11 +437,13 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( } log.Error("failed to initialize table trigger redo dispatcher", zap.Any("changefeedID", cfId.Name()), zap.Error(err)) - return m.handleDispatcherError(from, req.ChangefeedID, err) + manager.MaintainerFenceMu.Unlock() + return m.handleDispatcherError(from, req.ChangefeedID, req.MaintainerEpoch, err) } } if m.fenced.Load() { + manager.MaintainerFenceMu.Unlock() manager.LocalFence() return nil } @@ -416,7 +451,9 @@ func (m *DispatcherOrchestrator) handlePostBootstrapRequest( response := &heartbeatpb.MaintainerPostBootstrapResponse{ ChangefeedID: req.ChangefeedID, TableTriggerEventDispatcherId: req.TableTriggerEventDispatcherId, + MaintainerEpoch: req.MaintainerEpoch, } + manager.MaintainerFenceMu.Unlock() return m.sendResponse(from, messaging.MaintainerManagerTopic, response) } @@ -426,21 +463,50 @@ func (m *DispatcherOrchestrator) handleCloseRequest( ) error { cfId := common.NewChangefeedIDFromPB(req.ChangefeedID) response := &heartbeatpb.MaintainerCloseResponse{ - ChangefeedID: req.ChangefeedID, - Success: true, + ChangefeedID: req.ChangefeedID, + Success: true, + MaintainerEpoch: req.MaintainerEpoch, } m.mutex.Lock() - if manager, ok := m.dispatcherManagers[cfId]; ok { - if closed := manager.TryClose(req.Removed); closed { - delete(m.dispatcherManagers, cfId) - 
metrics.DispatcherManagerGauge.WithLabelValues(cfId.Keyspace(), cfId.Name()).Dec() - response.Success = true + manager, ok := m.dispatcherManagers[cfId] + if !ok { + m.recordClosedMaintainerEpochLocked(cfId, req.MaintainerEpoch, req.Removed) + } + m.mutex.Unlock() + + if ok { + // Do not hold the orchestrator-wide map lock while waiting for the + // per-changefeed fence; a slow manager must not block unrelated changefeeds. + decGauge := false + manager.MaintainerFenceMu.Lock() + if manager.IsMaintainerRequestAllowed(from, req.MaintainerEpoch) { + if closed := manager.TryClose(req.Removed); closed { + m.mutex.Lock() + delete(m.dispatcherManagers, cfId) + m.recordClosedMaintainerEpochLocked(cfId, req.MaintainerEpoch, req.Removed) + m.mutex.Unlock() + decGauge = true + response.Success = true + } else { + response.Success = false + } } else { - response.Success = false + // The active manager belongs to a newer maintainer. Do not close it, but + // acknowledge the stale sender so removal-only maintainers can stop retrying. + response.Success = true + log.Warn("drop stale maintainer close request", + zap.String("changefeed", cfId.Name()), + zap.String("from", from.String()), + zap.Uint64("requestMaintainerEpoch", req.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", manager.GetMaintainerEpoch()), + zap.String("currentMaintainer", manager.GetMaintainerID().String())) + } + manager.MaintainerFenceMu.Unlock() + if decGauge { + metrics.DispatcherManagerGauge.WithLabelValues(cfId.Keyspace(), cfId.Name()).Dec() } } - m.mutex.Unlock() log.Info("try close dispatcher manager", zap.String("changefeed", cfId.String()), zap.Bool("success", response.Success)) @@ -511,14 +577,30 @@ func (m *DispatcherOrchestrator) removeInitializingDispatcherManager( } } +// recordClosedMaintainerEpochLocked remembers closed maintainer generations so +// later bootstrap requests from older owners cannot recreate a closed manager. 
+func (m *DispatcherOrchestrator) recordClosedMaintainerEpochLocked(cfID common.ChangeFeedID, maintainerEpoch uint64, removed bool) { + if maintainerEpoch == 0 && !removed { + // Epoch 0 has no ordering information. Keep permanent tombstones only + // for removal so mixed-version resume can still bootstrap in compat mode. + return + } + closedEpoch, ok := m.closedMaintainerEpochs[cfID] + if ok && closedEpoch >= maintainerEpoch { + return + } + m.closedMaintainerEpochs[cfID] = maintainerEpoch +} + func createBootstrapResponse( changefeedID *heartbeatpb.ChangefeedID, manager *dispatchermanager.DispatcherManager, startTs, redoStartTs uint64, ) *heartbeatpb.MaintainerBootstrapResponse { response := &heartbeatpb.MaintainerBootstrapResponse{ - ChangefeedID: changefeedID, - Spans: make([]*heartbeatpb.BootstrapTableSpan, 0, manager.GetDispatcherMap().Len()), + ChangefeedID: changefeedID, + Spans: make([]*heartbeatpb.BootstrapTableSpan, 0, manager.GetDispatcherMap().Len()), + MaintainerEpoch: manager.GetMaintainerEpoch(), } // table trigger event dispatcher startTs @@ -582,10 +664,12 @@ func (m *DispatcherOrchestrator) Close() { func (m *DispatcherOrchestrator) handleDispatcherError( from node.ID, changefeedID *heartbeatpb.ChangefeedID, + maintainerEpoch uint64, err error, ) error { response := &heartbeatpb.MaintainerBootstrapResponse{ - ChangefeedID: changefeedID, + ChangefeedID: changefeedID, + MaintainerEpoch: maintainerEpoch, Err: &heartbeatpb.RunningError{ Time: time.Now().String(), Node: from.String(), @@ -638,7 +722,7 @@ func retrieveOperatorsForBootstrapResponse( manager *dispatchermanager.DispatcherManager, response *heartbeatpb.MaintainerBootstrapResponse, ) { - manager.GetCurrentOperatorMap().Range(func(key, value any) bool { + manager.GetCurrentOperatorMap().Range(func(_, value any) bool { req := value.(dispatchermanager.SchedulerDispatcherRequest) dispatcherID := common.NewDispatcherIDFromPB(req.Config.DispatcherID) if common.IsRedoMode(req.Config.Mode) { @@ 
-666,12 +750,8 @@ func retrieveOperatorsForBootstrapResponse( ) } } - response.Operators = append(response.Operators, &heartbeatpb.ScheduleDispatcherRequest{ - ChangefeedID: req.ChangefeedID, - Config: req.Config, - ScheduleAction: req.ScheduleAction, - OperatorType: req.OperatorType, - }) + response.Operators = append(response.Operators, + proto.Clone(req.ScheduleDispatcherRequest).(*heartbeatpb.ScheduleDispatcherRequest)) return true }) } diff --git a/downstreamadapter/dispatcherorchestrator/helper.go b/downstreamadapter/dispatcherorchestrator/helper.go index e036ced60a..31aba845f5 100644 --- a/downstreamadapter/dispatcherorchestrator/helper.go +++ b/downstreamadapter/dispatcherorchestrator/helper.go @@ -34,9 +34,8 @@ type pendingMessageKey struct { // Once Pop returns a message, the key leaves the pending set immediately, so the next // retry can queue one more request for the next processing round. // -// For MaintainerCloseRequest, we treat removed=true as stronger semantics than removed=false. -// While a request is still queued, a later removed=true request replaces removed=false in -// that queued slot so the next execution still observes the stronger semantics. +// For MaintainerCloseRequest, removed=true has stronger semantics than removed=false. +// It can upgrade a queued request only when it does not move the maintainer epoch backward. 
type pendingMessageQueue struct { mu sync.Mutex pending map[pendingMessageKey]*messaging.TargetMessage @@ -72,6 +71,9 @@ func (q *pendingMessageQueue) TryEnqueue(key pendingMessageKey, msg *messaging.T } func shouldReplacePendingMessage(key pendingMessageKey, oldMsg, newMsg *messaging.TargetMessage) bool { + if shouldReplaceByMaintainerEpoch(oldMsg, newMsg) { + return true + } if key.msgType != messaging.TypeMaintainerCloseRequest { return false } @@ -86,8 +88,35 @@ func shouldReplacePendingMessage(key pendingMessageKey, oldMsg, newMsg *messagin if !ok1 || !ok2 { return false } - // Only upgrade semantics: allow removed=true to override removed=false. - return !oldReq.Removed && newReq.Removed + // Only upgrade semantics: allow removed=true to override removed=false without + // letting a stale removed request overwrite a newer epoch close. + return !oldReq.Removed && newReq.Removed && newReq.MaintainerEpoch >= oldReq.MaintainerEpoch +} + +// shouldReplaceByMaintainerEpoch lets a newer maintainer generation replace an +// older queued control message for the same changefeed and message type. +func shouldReplaceByMaintainerEpoch(oldMsg, newMsg *messaging.TargetMessage) bool { + oldMaintainerEpoch, oldOK := pendingMessageMaintainerEpoch(oldMsg) + newMaintainerEpoch, newOK := pendingMessageMaintainerEpoch(newMsg) + return oldOK && newOK && newMaintainerEpoch > oldMaintainerEpoch +} + +// pendingMessageMaintainerEpoch extracts the maintainer epoch from messages +// whose ordering must be fenced by maintainer generation. 
+func pendingMessageMaintainerEpoch(msg *messaging.TargetMessage) (uint64, bool) { + if msg == nil || len(msg.Message) == 0 { + return 0, false + } + switch req := msg.Message[0].(type) { + case *heartbeatpb.MaintainerBootstrapRequest: + return req.MaintainerEpoch, true + case *heartbeatpb.MaintainerPostBootstrapRequest: + return req.MaintainerEpoch, true + case *heartbeatpb.MaintainerCloseRequest: + return req.MaintainerEpoch, true + default: + return 0, false + } } // Pop blocks until a message is available or the queue is closed. diff --git a/maintainer/barrier.go b/maintainer/barrier.go index f4a35325e8..9948fca20e 100644 --- a/maintainer/barrier.go +++ b/maintainer/barrier.go @@ -174,6 +174,7 @@ func (b *Barrier) HandleStatus(from node.ID, ChangefeedID: request.ChangefeedID, DispatcherStatuses: dispatcherStatus, Mode: b.mode, + MaintainerEpoch: b.operatorController.MaintainerEpoch(), }) msgs := []*messaging.TargetMessage{msg} @@ -185,6 +186,7 @@ func (b *Barrier) HandleStatus(from node.ID, ChangefeedID: request.ChangefeedID, DispatcherStatuses: action, Mode: b.mode, + MaintainerEpoch: b.operatorController.MaintainerEpoch(), }) msgs = append(msgs, msg) } diff --git a/maintainer/barrier_event.go b/maintainer/barrier_event.go index 85f93c7fcd..a91dced148 100644 --- a/maintainer/barrier_event.go +++ b/maintainer/barrier_event.go @@ -802,7 +802,8 @@ func (be *BarrierEvent) newWriterActionMessage(capture node.ID, mode int64) *mes }, }, }, - Mode: mode, + Mode: mode, + MaintainerEpoch: be.operatorController.MaintainerEpoch(), }) return msg } @@ -827,7 +828,8 @@ func (be *BarrierEvent) newPassActionMessage(capture node.ID, mode int64) *messa InfluencedDispatchers: influenced, }, }, - Mode: mode, + Mode: mode, + MaintainerEpoch: be.operatorController.MaintainerEpoch(), }) } diff --git a/maintainer/barrier_test.go b/maintainer/barrier_test.go index d0f9a63c63..ee4ac4e783 100644 --- a/maintainer/barrier_test.go +++ b/maintainer/barrier_test.go @@ -1410,7 +1410,7 @@ func 
TestUpdateCheckpointTs(t *testing.T) { require.Equal(t, resp.DispatcherStatuses[1].Action.Action, heartbeatpb.Action_Write) require.False(t, resp.DispatcherStatuses[1].Action.IsSyncPoint) // the checkpoint ts is updated - scheduleMsg := ddlSpan.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + scheduleMsg := ddlSpan.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) require.Equal(t, uint64(9), scheduleMsg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest).Config.StartTs, false) require.NotEqual(t, uint64(0), scheduleMsg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest).Config.StartTs, false) } diff --git a/maintainer/maintainer.go b/maintainer/maintainer.go index 3e5bd75a2a..96d89a429b 100644 --- a/maintainer/maintainer.go +++ b/maintainer/maintainer.go @@ -205,7 +205,7 @@ func NewMaintainer(cfID common.ChangeFeedID, eventCh: chann.NewAutoDrainChann[*Event](), startCheckpointTs: checkpointTs, controller: NewController(cfID, checkpointTs, taskScheduler, - info.Config, ddlSpan, redoDDLSpan, conf.AddTableBatchSize, time.Duration(conf.CheckBalanceInterval), refresher, keyspaceMeta, enableRedo, conf.BalanceMoveBatchSize), + info.Config, ddlSpan, redoDDLSpan, conf.AddTableBatchSize, time.Duration(conf.CheckBalanceInterval), refresher, keyspaceMeta, enableRedo, conf.BalanceMoveBatchSize, info.Epoch), mc: mc, removed: atomic.NewBool(false), nodeManager: nodeManager, @@ -288,11 +288,13 @@ func NewMaintainerForRemove(cfID common.ChangeFeedID, selfNode *node.Info, taskScheduler threadpool.ThreadPool, keyspaceID uint32, + maintainerEpoch uint64, ) *Maintainer { unused := &config.ChangeFeedInfo{ ChangefeedID: cfID, SinkURI: "", Config: config.GetDefaultReplicaConfig(), + Epoch: maintainerEpoch, } m := NewMaintainer(cfID, conf, unused, selfNode, taskScheduler, 1, false, keyspaceID) m.cascadeRemoving.Store(true) @@ -385,12 +387,13 @@ func (m *Maintainer) GetMaintainerStatus() *heartbeatpb.MaintainerStatus { } status := 
&heartbeatpb.MaintainerStatus{ - ChangefeedID: m.changefeedID.ToPB(), - State: heartbeatpb.ComponentState(m.scheduleState.Load()), - CheckpointTs: m.controller.spanController.GetMaintainerCommittedCheckpointTs(), - Err: runningErrors, - BootstrapDone: m.initialized.Load(), - LastSyncedTs: m.getWatermark().LastSyncedTs, + ChangefeedID: m.changefeedID.ToPB(), + State: heartbeatpb.ComponentState(m.scheduleState.Load()), + CheckpointTs: m.controller.spanController.GetMaintainerCommittedCheckpointTs(), + Err: runningErrors, + BootstrapDone: m.initialized.Load(), + LastSyncedTs: m.getWatermark().LastSyncedTs, + MaintainerEpoch: m.currentMaintainerEpoch(), } drainTarget, drainEpoch := m.controller.getDispatcherDrainTarget() if !drainTarget.IsEmpty() && drainEpoch > 0 { @@ -492,6 +495,13 @@ func (m *Maintainer) cleanupMetrics() { metrics.TableCountGauge.DeleteLabelValues(keyspace, name, "redo") } +func (m *Maintainer) markRemoved() { + if !m.removed.CompareAndSwap(false, true) { + return + } + metrics.MaintainerGauge.WithLabelValues(m.changefeedID.Keyspace(), m.changefeedID.Name()).Dec() +} + func (m *Maintainer) onInit() bool { err := m.initialize() if err != nil { @@ -526,6 +536,14 @@ func (m *Maintainer) onMessage(msg *messaging.TargetMessage) { m.onMaintainerCloseResponse(msg.From, resp) case messaging.TypeRemoveMaintainerRequest: req := msg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) + if !m.isMaintainerEpochRequestAllowed(req.MaintainerEpoch) { + log.Warn("drop stale remove maintainer request", + zap.Stringer("changefeedID", m.changefeedID), + zap.Stringer("from", msg.From), + zap.Uint64("requestMaintainerEpoch", req.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", m.currentMaintainerEpoch())) + return + } m.onRemoveMaintainer(req.Cascade, req.Removed) case messaging.TypeCheckpointTsMessage: req := msg.Message[0].(*heartbeatpb.CheckpointTsMessage) @@ -562,9 +580,8 @@ func (m *Maintainer) onRemoveMaintainer(cascade, changefeedRemoved bool) { 
m.controller.EnterRemovingMode(allowedDispatcherIDs...) closed := m.tryCloseChangefeed() if closed { - m.removed.Store(true) + m.markRemoved() m.scheduleState.Store(int32(heartbeatpb.ComponentState_Stopped)) - metrics.MaintainerGauge.WithLabelValues(m.changefeedID.Keyspace(), m.changefeedID.Name()).Dec() log.Info("changefeed maintainer closed", zap.Stringer("changefeedID", m.changefeedID), zap.Uint64("checkpointTs", m.getWatermark().CheckpointTs), zap.Bool("removed", m.removed.Load())) } @@ -861,6 +878,13 @@ func (m *Maintainer) sendMessages(msgs []*messaging.TargetMessage) { } } +func (m *Maintainer) currentMaintainerEpoch() uint64 { + if m.info == nil { + return 0 + } + return m.info.Epoch +} + func (m *Maintainer) onHeartbeatRequest(msg *messaging.TargetMessage) { // ignore the heartbeat if the maintainer not bootstrapped if !m.initialized.Load() { @@ -957,6 +981,10 @@ func (m *Maintainer) onMaintainerBootstrapResponse(msg *messaging.TargetMessage) zap.Stringer("sourceNodeID", msg.From)) resp := msg.Message[0].(*heartbeatpb.MaintainerBootstrapResponse) + if !m.isMaintainerEpochResponseAllowed(resp.MaintainerEpoch) { + m.logDroppedMaintainerResponse("bootstrap", msg.From, resp.MaintainerEpoch) + return + } if resp.Err != nil { log.Warn("maintainer bootstrap failed", zap.Stringer("changefeedID", m.changefeedID), @@ -980,6 +1008,10 @@ func (m *Maintainer) onMaintainerPostBootstrapResponse(msg *messaging.TargetMess zap.Stringer("changefeedID", m.changefeedID), zap.Any("server", msg.From)) resp := msg.Message[0].(*heartbeatpb.MaintainerPostBootstrapResponse) + if !m.isMaintainerEpochResponseAllowed(resp.MaintainerEpoch) { + m.logDroppedMaintainerResponse("post-bootstrap", msg.From, resp.MaintainerEpoch) + return + } if resp.Err != nil { log.Warn("maintainer post bootstrap failed", zap.Stringer("changefeedID", m.changefeedID), @@ -991,6 +1023,36 @@ func (m *Maintainer) onMaintainerPostBootstrapResponse(msg *messaging.TargetMess m.postBootstrapMsg = nil } +// 
isMaintainerEpochResponseAllowed accepts current-generation responses while +// preserving epoch-0 compatibility during rolling upgrades. +func (m *Maintainer) isMaintainerEpochResponseAllowed(responseEpoch uint64) bool { + return common.MaintainerEpochMatches(responseEpoch, m.currentMaintainerEpoch()) +} + +// isMaintainerEpochRequestAllowed fences dispatcher-manager requests that can +// close or mutate local dispatcher state on behalf of a maintainer generation. +func (m *Maintainer) isMaintainerEpochRequestAllowed(requestEpoch uint64) bool { + currentEpoch := m.currentMaintainerEpoch() + if requestEpoch == 0 { + // Epoch 0 is only valid while this maintainer is still in compatibility + // mode. A strict maintainer must not accept an unfenced tombstone. + return currentEpoch == 0 + } + // A strict request can still control a compatibility maintainer during + // rolling upgrade, but strict maintainers require an exact epoch match. + return currentEpoch == 0 || requestEpoch == currentEpoch +} + +// logDroppedMaintainerResponse records responses rejected by maintainer epoch fencing. +func (m *Maintainer) logDroppedMaintainerResponse(responseType string, from node.ID, responseEpoch uint64) { + log.Warn("drop stale maintainer response", + zap.Stringer("changefeedID", m.changefeedID), + zap.String("responseType", responseType), + zap.Stringer("from", from), + zap.Uint64("responseMaintainerEpoch", responseEpoch), + zap.Uint64("currentMaintainerEpoch", m.currentMaintainerEpoch())) +} + // isMysqlCompatible returns true if the sinkURIStr is mysql compatible. 
func isMysqlCompatible(sinkURIStr string) (bool, error) { sinkURI, err := url.Parse(sinkURIStr) @@ -1060,6 +1122,20 @@ func (m *Maintainer) sendPostBootstrapRequest() { } func (m *Maintainer) onMaintainerCloseResponse(from node.ID, response *heartbeatpb.MaintainerCloseResponse) { + if !m.isMaintainerEpochResponseAllowed(response.MaintainerEpoch) { + m.logDroppedMaintainerResponse("close", from, response.MaintainerEpoch) + return + } + if !m.removing.Load() { + // Close responses only complete an active remove flow. A delayed compat + // response from a superseded maintainer can share this changefeed ID. + log.Warn("drop unexpected maintainer close response", + zap.Stringer("changefeedID", m.changefeedID), + zap.Stringer("from", from), + zap.Uint64("responseMaintainerEpoch", response.MaintainerEpoch), + zap.Uint64("currentMaintainerEpoch", m.currentMaintainerEpoch())) + return + } if response.Success { m.closedNodes[from] = struct{}{} m.onRemoveMaintainer(m.cascadeRemoving.Load(), m.changefeedRemoved.Load()) @@ -1115,8 +1191,9 @@ func (m *Maintainer) trySendMaintainerCloseRequestToAllNode() bool { n, messaging.DispatcherManagerManagerTopic, &heartbeatpb.MaintainerCloseRequest{ - ChangefeedID: m.changefeedID.ToPB(), - Removed: m.changefeedRemoved.Load(), + ChangefeedID: m.changefeedID.ToPB(), + Removed: m.changefeedRemoved.Load(), + MaintainerEpoch: m.currentMaintainerEpoch(), })) } } @@ -1177,6 +1254,7 @@ func (m *Maintainer) createBootstrapMessageFactory() bootstrap.NewBootstrapReque TableTriggerRedoDispatcherId: nil, IsNewChangefeed: false, KeyspaceId: m.info.KeyspaceID, + MaintainerEpoch: m.currentMaintainerEpoch(), } // only send dispatcher targetNodeID to dispatcher manager on the same node diff --git a/maintainer/maintainer_controller.go b/maintainer/maintainer_controller.go index ea5381f81f..1660a9ba2f 100644 --- a/maintainer/maintainer_controller.go +++ b/maintainer/maintainer_controller.go @@ -15,6 +15,7 @@ package maintainer import ( "sync" + "sync/atomic" 
"time" "github.com/pingcap/log" @@ -58,8 +59,9 @@ type Controller struct { splitter *split.Splitter - replicaConfig *config.ReplicaConfig - changefeedID common.ChangeFeedID + replicaConfig *config.ReplicaConfig + changefeedID common.ChangeFeedID + maintainerEpoch atomic.Uint64 taskPool threadpool.ThreadPool @@ -94,6 +96,7 @@ func NewController(changefeedID common.ChangeFeedID, keyspaceMeta common.KeyspaceMeta, enableRedo bool, balanceMoveBatchSize int, + maintainerEpoch uint64, ) *Controller { mc := appcontext.GetService[messaging.MessageCenter](appcontext.MessageCenter) @@ -160,6 +163,7 @@ func NewController(changefeedID common.ChangeFeedID, controller.drainState, balanceMoveBatchSize, ) + controller.SetMaintainerEpoch(maintainerEpoch) return controller } @@ -170,6 +174,20 @@ func (c *Controller) SetErrorReporter(reportError func(error)) { } } +// SetMaintainerEpoch propagates the changefeed epoch used to fence +// dispatcher-manager control requests from stale maintainers. +func (c *Controller) SetMaintainerEpoch(maintainerEpoch uint64) { + c.maintainerEpoch.Store(maintainerEpoch) + c.operatorController.SetMaintainerEpoch(maintainerEpoch) + if c.redoOperatorController != nil { + c.redoOperatorController.SetMaintainerEpoch(maintainerEpoch) + } +} + +func (c *Controller) currentMaintainerEpoch() uint64 { + return c.maintainerEpoch.Load() +} + // HandleStatus handle the status report from the node. func (c *Controller) HandleStatus(from node.ID, statusList []*heartbeatpb.TableSpanStatus) { c.handleStatus(from, statusList, true) @@ -217,7 +235,16 @@ func (c *Controller) handleStatus(from node.ID, statusList []*heartbeatpb.TableS zap.Any("status", status), zap.String("dispatcherID", dispatcherID.String())) // If the span is not found but status is Working, we need to remove it from dispatcher. 
- _ = c.messageCenter.SendCommand(replica.NewRemoveDispatcherMessage(from, c.changefeedID, status.ID, nil, status.Mode, heartbeatpb.OperatorType_O_Remove)) + msg := replica.NewRemoveDispatcherMessage( + from, + c.changefeedID, + status.ID, + nil, + status.Mode, + heartbeatpb.OperatorType_O_Remove, + c.currentMaintainerEpoch(), + ) + _ = c.messageCenter.SendCommand(msg) } continue } diff --git a/maintainer/maintainer_controller_bootstrap.go b/maintainer/maintainer_controller_bootstrap.go index 3df71f2911..96277ec281 100644 --- a/maintainer/maintainer_controller_bootstrap.go +++ b/maintainer/maintainer_controller_bootstrap.go @@ -153,6 +153,7 @@ func (c *Controller) FinishBootstrap( TableTriggerEventDispatcherId: c.spanController.GetDDLDispatcherID().ToPB(), Schemas: c.prepareSchemaInfoResponse(schemaInfos), RedoSchemas: c.prepareSchemaInfoResponse(redoSchemaInfos), + MaintainerEpoch: c.currentMaintainerEpoch(), }, nil } @@ -836,8 +837,14 @@ func (c *Controller) handleCurrentWorkingAdd( // 3. If the original operator is split, which is a remove + add + add..., // same as move, just finish the add part. 
case heartbeatpb.OperatorType_O_Add, heartbeatpb.OperatorType_O_Move, heartbeatpb.OperatorType_O_Split: - op := operator.NewAddDispatcherOperator(spanController, replicaSet, node, heartbeatpb.OperatorType_O_Add) operatorController := c.getOperatorController(req.Config.Mode) + op := operator.NewAddDispatcherOperator( + spanController, + replicaSet, + node, + heartbeatpb.OperatorType_O_Add, + operatorController.MaintainerEpoch(), + ) if ok := operatorController.AddOperator(op); !ok { log.Error("add operator failed when dealing current working operators in bootstrap, should not happen", zap.String("nodeID", node.String()), @@ -876,6 +883,7 @@ func (c *Controller) handleCurrentWorkingRemove( spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, + operatorController.MaintainerEpoch(), nil, ) if ok := operatorController.AddOperator(op); !ok { @@ -897,6 +905,7 @@ func (c *Controller) handleCurrentWorkingRemove( spanController, replicaSet, req.OperatorType, + operatorController.MaintainerEpoch(), func() { // post finish // Mark the span absent only if it still exists. A concurrent DDL may have already removed it, // and we must not reintroduce a ghost entry into spanController. 
diff --git a/maintainer/maintainer_controller_helper.go b/maintainer/maintainer_controller_helper.go index 85db5b2b44..a319a3cdc9 100644 --- a/maintainer/maintainer_controller_helper.go +++ b/maintainer/maintainer_controller_helper.go @@ -212,7 +212,14 @@ func (c *Controller) splitTableByRegionCount(tableID int64, mode int64) error { } splitTableSpans := splitter.Split(context.Background(), wholeSpan, 0, split.SplitTypeRegionCount) - op := operator.NewSplitDispatcherOperator(spanController, replications[0], splitTableSpans, []node.ID{}, nil) + op := operator.NewSplitDispatcherOperator( + spanController, + replications[0], + splitTableSpans, + []node.ID{}, + operatorController.MaintainerEpoch(), + nil, + ) ret := operatorController.AddOperator(op) if !ret { return errors.ErrOperatorIsNil.GenWithStackByArgs("unexpected error in create split dispatcher operator") diff --git a/maintainer/maintainer_controller_test.go b/maintainer/maintainer_controller_test.go index d1226bd72f..56327458bc 100644 --- a/maintainer/maintainer_controller_test.go +++ b/maintainer/maintainer_controller_test.go @@ -76,7 +76,7 @@ func TestSchedule(t *testing.T) { CheckpointTs: 1, }, "node1", false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) - controller := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 9, time.Minute, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + controller := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 9, time.Minute, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) for i := 0; i < 10; i++ { controller.spanController.AddNewTable(commonEvent.Table{ SchemaID: 1, @@ -96,6 +96,48 @@ func TestSchedule(t *testing.T) { require.Equal(t, 3, controller.spanController.GetTaskSizeByNodeID("node3")) } +func TestNewControllerInitializesMaintainerEpoch(t *testing.T) { + testutil.SetUpTestServices(t) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlDispatcherID := 
common.NewDispatcherID() + redoDDLDispatcherID := common.NewDispatcherID() + ddlSpan := replica.NewWorkingSpanReplication(cfID, ddlDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: ddlDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + redoDDLSpan := replica.NewWorkingSpanReplication(cfID, redoDDLDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: redoDDLDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + const maintainerEpoch = uint64(42) + + controller := NewController( + cfID, + 1, + nil, + replicaConfig, + ddlSpan, + redoDDLSpan, + 9, + time.Minute, + replica.NewRegionCountRefresher(cfID, time.Minute), + common.DefaultKeyspace, + true, + testBalanceMoveBatchSize, + maintainerEpoch, + ) + + require.Equal(t, maintainerEpoch, controller.currentMaintainerEpoch()) + require.Equal(t, maintainerEpoch, controller.operatorController.MaintainerEpoch()) + require.Equal(t, maintainerEpoch, controller.redoOperatorController.MaintainerEpoch()) +} + // This case test the scenario that the balance scheduler when a new node join in. 
// In this case, the num of split tables is more than the num of nodes, // and we can select appropriate split spans to move @@ -122,7 +164,7 @@ func TestBalanceGroupsNewNodeAdd_SplitsTableMoreThanNodeNum(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) nodeID := node.ID("node1") for i := range 100 { @@ -254,7 +296,7 @@ func TestBalanceGroupsNewNodeAdd_SplitsTableLessThanNodeNum(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) regionCache := appcontext.GetService[*testutil.MockCache](appcontext.RegionCache) @@ -376,7 +418,7 @@ func TestSplitBalanceGroupsWithNodeRemove(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) nodeIDList := []node.ID{"node1", "node2", "node3"} for i := 0; i < 100; i++ { @@ -478,7 +520,7 @@ func TestSplitTableBalanceWhenTrafficUnbalanced(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) nodeIDList := []node.ID{"node1", "node2", "node3"} // make a group @@ -1079,7 +1121,7 @@ func TestBalance(t *testing.T) { CheckpointTs: 1, }, "node1", false) refresher := 
replica.NewRegionCountRefresher(cfID, time.Minute) - s := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + s := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) for i := 0; i < 100; i++ { sz := common.TableIDToComparableSpan(common.DefaultKeyspaceID, int64(i)) span := &heartbeatpb.TableSpan{TableID: sz.TableID, StartKey: sz.StartKey, EndKey: sz.EndKey} @@ -1185,7 +1227,7 @@ func TestDefaultSpanIntoSplit(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) totalSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, 1) span := &heartbeatpb.TableSpan{TableID: int64(1), StartKey: totalSpan.StartKey, EndKey: totalSpan.EndKey} dispatcherID := common.NewDispatcherID() @@ -1327,7 +1369,7 @@ func TestStoppedWhenMoving(t *testing.T) { CheckpointTs: 1, }, "node1", false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) - s := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + s := NewController(cfID, 1, nil, replicaConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) for i := 0; i < 2; i++ { sz := common.TableIDToComparableSpan(common.DefaultKeyspaceID, int64(i)) span := &heartbeatpb.TableSpan{TableID: sz.TableID, StartKey: sz.StartKey, EndKey: sz.EndKey} @@ -1380,7 +1422,7 @@ func TestFinishBootstrap(t *testing.T) { }, "node1", false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) s := NewController(cfID, 1, &mockThreadPool{}, - config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, 
refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) totalSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, 1) span := &heartbeatpb.TableSpan{TableID: int64(1), StartKey: totalSpan.StartKey, EndKey: totalSpan.EndKey} schemaStore := eventservice.NewMockSchemaStore() @@ -1453,7 +1495,7 @@ func TestFinishBootstrapReturnsErrorWhenCheckpointMissing(t *testing.T) { }, "node1", false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) controller := NewController(cfID, 1, &mockThreadPool{}, - config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) postBootstrapRequest, err := controller.FinishBootstrap(map[node.ID]*heartbeatpb.MaintainerBootstrapResponse{ "node1": { @@ -1517,7 +1559,7 @@ func TestFinishBootstrapSkipsStaleCreateOperatorForDroppedTable(t *testing.T) { }, "node1", false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) s := NewController(cfID, 1, &mockThreadPool{}, - config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) // The schema-store snapshot is empty at bootstrap startTs, which models a table // that has already been dropped before failover recovery starts. 
@@ -1597,7 +1639,7 @@ func TestSplitTableWhenBootstrapFinished(t *testing.T) { MaxTrafficPercentage: util.AddressOf(1.2), } refresher := replica.NewRegionCountRefresher(cfID, time.Minute) - s := NewController(cfID, 1, nil, defaultConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + s := NewController(cfID, 1, nil, defaultConfig, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) s.taskPool = &mockThreadPool{} schemaStore := eventservice.NewMockSchemaStore() schemaStore.SetTables( @@ -1777,7 +1819,7 @@ func TestLargeTableInitialization(t *testing.T) { MinTrafficPercentage: util.AddressOf(0.8), MaxTrafficPercentage: util.AddressOf(1.2), }, - }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + }, ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 0) // Create a large table with 10000 regions totalSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, int64(1)) diff --git a/maintainer/maintainer_manager_maintainers.go b/maintainer/maintainer_manager_maintainers.go index ae53e97900..6a19897343 100644 --- a/maintainer/maintainer_manager_maintainers.go +++ b/maintainer/maintainer_manager_maintainers.go @@ -51,6 +51,10 @@ type managerMaintainerSet struct { // taskScheduler is shared by all local maintainers to run background tasks. taskScheduler threadpool.ThreadPool + // registryMu serializes registry mutations that create, replace, or fully + // close maintainers because maintainer metrics share changefeed labels across + // epochs. + registryMu sync.Mutex // registry is the in-memory changefeedID -> maintainer mapping. 
registry sync.Map } @@ -179,10 +183,16 @@ func (p *managerMaintainerSet) handleAddMaintainer( getDrainTarget func() (node.ID, uint64), ) *heartbeatpb.MaintainerStatus { changefeedID := common.NewChangefeedIDFromPB(req.Id) - if _, ok := p.registry.Load(changefeedID); ok { + if req.CheckpointTs == 0 { + log.Error("ignore add maintainer request with invalid checkpointTs", + zap.Stringer("changefeedID", changefeedID), + zap.Uint64("checkpointTs", req.CheckpointTs)) + return nil + } + requestEpoch := req.MaintainerEpoch + if !p.mayRegisterMaintainerForAdd(changefeedID, requestEpoch) { return nil } - info := &config.ChangeFeedInfo{} if err := json.Unmarshal(req.Config, info); err != nil { log.Error("ignore add maintainer request with invalid config", @@ -191,22 +201,19 @@ func (p *managerMaintainerSet) handleAddMaintainer( zap.Error(err)) return nil } - if req.CheckpointTs == 0 { - log.Error("ignore add maintainer request with invalid checkpointTs", - zap.Stringer("changefeedID", changefeedID), - zap.Uint64("checkpointTs", req.CheckpointTs)) - return nil + // The wire epoch is the sender capability signal. If an old coordinator sends + // epoch 0, keep the maintainer in compatibility mode even when the serialized + // config still carries a non-zero ChangeFeedInfo epoch. + info.Epoch = requestEpoch + // Create the maintainer only after epoch admission so normal duplicate + // add retries do not start short-lived goroutines or metrics. + newMaintainer := func() *Maintainer { + return NewMaintainer(changefeedID, p.conf, info, p.nodeInfo, p.taskScheduler, req.CheckpointTs, req.IsNewChangefeed, req.KeyspaceId) } - maintainer := NewMaintainer(changefeedID, p.conf, info, p.nodeInfo, p.taskScheduler, req.CheckpointTs, req.IsNewChangefeed, req.KeyspaceId) - registered, loaded := p.registry.LoadOrStore(changefeedID, maintainer) - if loaded { - // Duplicate add requests can race on the same changefeed. 
Drop the loser and - // stop the redundant maintainer immediately so background goroutines do not leak. - maintainer.Close() + registeredMaintainer := p.registerMaintainerForAdd(changefeedID, requestEpoch, newMaintainer) + if registeredMaintainer == nil { return nil } - - registeredMaintainer := registered.(*Maintainer) // Register the maintainer before seeding the drain snapshot so concurrent // manager-level drain fanout can always observe it in the registry. target, epoch := getDrainTarget() @@ -215,6 +222,96 @@ func (p *managerMaintainerSet) handleAddMaintainer( return nil } +// mayRegisterMaintainerForAdd performs a cheap admission check before decoding +// config and constructing a maintainer. +func (p *managerMaintainerSet) mayRegisterMaintainerForAdd( + changefeedID common.ChangeFeedID, + requestEpoch uint64, +) bool { + registered, loaded := p.registry.Load(changefeedID) + if !loaded { + return true + } + existing := registered.(*Maintainer) + allowed := canRegisterAfterExistingMaintainer(existing, requestEpoch) + if !allowed { + logRejectedAddMaintainer(changefeedID, existing, requestEpoch) + } + return allowed +} + +// registerMaintainerForAdd installs a newly created maintainer after rechecking +// epoch and stopped-state admission under the registry mutation lock. 
+func (p *managerMaintainerSet) registerMaintainerForAdd( + changefeedID common.ChangeFeedID, + requestEpoch uint64, + newMaintainer func() *Maintainer, +) *Maintainer { + p.registryMu.Lock() + defer p.registryMu.Unlock() + + registered, loaded := p.registry.Load(changefeedID) + if !loaded { + maintainer := newMaintainer() + p.registry.Store(changefeedID, maintainer) + return maintainer + } + existing := registered.(*Maintainer) + if !canRegisterAfterExistingMaintainer(existing, requestEpoch) { + logRejectedAddMaintainer(changefeedID, existing, requestEpoch) + return nil + } + // The old maintainer has fully stopped, so it is safe to release the + // shared metric labels before the new maintainer creates its own metric + // children for the same changefeed. + existing.Close() + maintainer := newMaintainer() + p.registry.Store(changefeedID, maintainer) + return maintainer +} + +// canRegisterAfterExistingMaintainer reports whether an add request can replace +// the existing local maintainer without overlapping two live owners. +func canRegisterAfterExistingMaintainer(existing *Maintainer, requestEpoch uint64) bool { + if !isMaintainerFullyStopped(existing) { + return false + } + return isNewerMaintainerEpoch(existing.currentMaintainerEpoch(), requestEpoch) +} + +// isNewerMaintainerEpoch applies strict epoch ordering for replacement adds. +func isNewerMaintainerEpoch(existingEpoch, requestEpoch uint64) bool { + if requestEpoch == 0 { + return false + } + if existingEpoch == 0 { + return true + } + return requestEpoch > existingEpoch +} + +// isMaintainerFullyStopped reports whether the old maintainer has finished its +// remove flow and released scheduler ownership. 
+func isMaintainerFullyStopped(maintainer *Maintainer) bool { + return maintainer.removed.Load() && + heartbeatpb.ComponentState(maintainer.scheduleState.Load()) == heartbeatpb.ComponentState_Stopped +} + +// logRejectedAddMaintainer emits detail only for newer requests blocked by a +// still-running local maintainer. +func logRejectedAddMaintainer(changefeedID common.ChangeFeedID, existing *Maintainer, requestEpoch uint64) { + existingEpoch := existing.currentMaintainerEpoch() + if requestEpoch <= existingEpoch || isMaintainerFullyStopped(existing) { + return + } + log.Warn("reject add maintainer request because existing maintainer is still running", + zap.Stringer("changefeedID", changefeedID), + zap.Uint64("requestMaintainerEpoch", requestEpoch), + zap.Uint64("existingMaintainerEpoch", existingEpoch), + zap.Bool("existingRemoved", existing.removed.Load()), + zap.String("existingState", heartbeatpb.ComponentState(existing.scheduleState.Load()).String())) +} + // handleRemoveMaintainer handles both normal remove and cascade-remove flows. func (p *managerMaintainerSet) handleRemoveMaintainer(msg *messaging.TargetMessage) *heartbeatpb.MaintainerStatus { req := msg.Message[0].(*heartbeatpb.RemoveMaintainerRequest) @@ -227,15 +324,28 @@ func (p *managerMaintainerSet) handleRemoveMaintainer(msg *messaging.TargetMessa zap.Stringer("changefeedID", changefeedID), zap.Any("request", req)) return &heartbeatpb.MaintainerStatus{ - ChangefeedID: req.GetId(), - State: heartbeatpb.ComponentState_Stopped, + ChangefeedID: req.GetId(), + State: heartbeatpb.ComponentState_Stopped, + MaintainerEpoch: req.MaintainerEpoch, } } // It's cascade remove, we should remove the dispatcher from all node. // Here we create a maintainer to run the remove dispatcher logic. 
- maintainer = NewMaintainerForRemove(changefeedID, p.conf, p.nodeInfo, p.taskScheduler, req.KeyspaceId) - p.registry.Store(changefeedID, maintainer) + p.registryMu.Lock() + maintainer, ok = p.registry.Load(changefeedID) + if !ok { + maintainer = NewMaintainerForRemove( + changefeedID, + p.conf, + p.nodeInfo, + p.taskScheduler, + req.KeyspaceId, + req.MaintainerEpoch, + ) + p.registry.Store(changefeedID, maintainer) + } + p.registryMu.Unlock() } maintainer.(*Maintainer).pushEvent(&Event{ changefeedID: changefeedID, @@ -270,19 +380,33 @@ func (p *managerMaintainerSet) buildHeartbeat() *heartbeatpb.MaintainerHeartbeat // cleanupRemovedMaintainers closes maintainers after their remove flow has finished. func (p *managerMaintainerSet) cleanupRemovedMaintainers() { p.registry.Range(func(key, value interface{}) bool { - cf := value.(*Maintainer) - if cf.removed.Load() { - cf.Close() - log.Info("maintainer removed, remove it from dynamic stream", - zap.Stringer("changefeedID", cf.changefeedID), - zap.Uint64("checkpointTs", cf.getWatermark().CheckpointTs), - ) - p.registry.Delete(key) - } + p.cleanupRemovedMaintainer(key, value) return true }) } +// cleanupRemovedMaintainer removes only the registry entry that still owns the +// shared changefeed metric labels observed by Range. +func (p *managerMaintainerSet) cleanupRemovedMaintainer(key, value interface{}) { + p.registryMu.Lock() + defer p.registryMu.Unlock() + + cf := value.(*Maintainer) + if !cf.removed.Load() { + return + } + // Range can observe a removed maintainer just before a newer epoch replaces it. + // Only the value still stored in the registry owns the shared metric labels. 
+ if !p.registry.CompareAndDelete(key, cf) { + return + } + cf.Close() + log.Info("maintainer removed, remove it from dynamic stream", + zap.Stringer("changefeedID", cf.changefeedID), + zap.Uint64("checkpointTs", cf.getWatermark().CheckpointTs), + ) +} + // applyDispatcherDrainTarget fans out the latest node-scoped drain target to // every currently active maintainer. func (p *managerMaintainerSet) applyDispatcherDrainTarget(target node.ID, epoch uint64) { diff --git a/maintainer/maintainer_manager_test.go b/maintainer/maintainer_manager_test.go index f6efa3baf8..fb957c37b0 100644 --- a/maintainer/maintainer_manager_test.go +++ b/maintainer/maintainer_manager_test.go @@ -17,6 +17,7 @@ import ( "context" "encoding/json" "net" + "strconv" "sync" "testing" "time" @@ -37,10 +38,12 @@ import ( "github.com/pingcap/ticdc/pkg/liveness" "github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/messaging/proto" + "github.com/pingcap/ticdc/pkg/metrics" "github.com/pingcap/ticdc/pkg/node" "github.com/pingcap/ticdc/pkg/orchestrator" "github.com/pingcap/ticdc/pkg/pdutil" "github.com/pingcap/ticdc/server/watcher" + promtestutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/require" "google.golang.org/grpc" ) @@ -71,6 +74,211 @@ func runCancelable(t *testing.T, ctx context.Context, run func(context.Context) }) } +func newAddMaintainerRequestForEpoch( + t *testing.T, + cfID common.ChangeFeedID, + configEpoch uint64, + requestEpoch uint64, +) *heartbeatpb.AddMaintainerRequest { + t.Helper() + + info := &config.ChangeFeedInfo{ + ChangefeedID: cfID, + Config: config.GetDefaultReplicaConfig(), + Epoch: configEpoch, + } + data, err := json.Marshal(info) + require.NoError(t, err) + return &heartbeatpb.AddMaintainerRequest{ + Id: cfID.ToPB(), + Config: data, + CheckpointTs: 10, + KeyspaceId: common.DefaultKeyspaceID, + MaintainerEpoch: requestEpoch, + } +} + +func newManagerMaintainerSetForAddTest(t *testing.T) *managerMaintainerSet { 
+ t.Helper() + + testutil.SetUpTestServices(t) + selfNode := node.NewInfo("", "") + maintainers := newManagerMaintainerSet(config.NewDefaultSchedulerConfig(), selfNode) + t.Cleanup(maintainers.closeAll) + return maintainers +} + +func cleanupMaintainerMetricsForTest(t *testing.T, cfID common.ChangeFeedID) { + t.Helper() + + cleanup := func() { + keyspace := cfID.Keyspace() + name := cfID.Name() + metrics.MaintainerGauge.DeleteLabelValues(keyspace, name) + metrics.MaintainerCheckpointTsGauge.DeleteLabelValues(keyspace, name) + metrics.MaintainerCheckpointTsLagGauge.DeleteLabelValues(keyspace, name) + metrics.MaintainerHandleEventDuration.DeleteLabelValues(keyspace, name) + metrics.MaintainerEventChLenGauge.DeleteLabelValues(keyspace, name) + metrics.MaintainerResolvedTsGauge.DeleteLabelValues(keyspace, name) + metrics.MaintainerResolvedTsLagGauge.DeleteLabelValues(keyspace, name) + + metrics.TableStateGauge.DeleteLabelValues(keyspace, name, "Absent", "default") + metrics.TableStateGauge.DeleteLabelValues(keyspace, name, "Absent", "redo") + metrics.TableStateGauge.DeleteLabelValues(keyspace, name, "Working", "default") + metrics.TableStateGauge.DeleteLabelValues(keyspace, name, "Working", "redo") + + metrics.ScheduleTaskGauge.DeleteLabelValues(keyspace, name, "default") + metrics.ScheduleTaskGauge.DeleteLabelValues(keyspace, name, "redo") + metrics.SpanCountGauge.DeleteLabelValues(keyspace, name, "default") + metrics.SpanCountGauge.DeleteLabelValues(keyspace, name, "redo") + metrics.TableCountGauge.DeleteLabelValues(keyspace, name, "default") + metrics.TableCountGauge.DeleteLabelValues(keyspace, name, "redo") + } + cleanup() + t.Cleanup(cleanup) +} + +func TestManagerMaintainerSet_AddMaintainerRejectsLiveNewerEpoch(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) + cfID := common.NewChangeFeedIDWithName("reject-live-newer-epoch", common.DefaultKeyspaceName) + cleanupMaintainerMetricsForTest(t, cfID) + noDrainTarget := func() (node.ID, uint64) { 
return "", 0 } + keyspace, changefeed := cfID.Keyspace(), cfID.Name() + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 1, 1), noDrainTarget) + oldMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.Equal(t, uint64(1), oldMaintainer.currentMaintainerEpoch()) + require.Equal(t, float64(1), promtestutil.ToFloat64(metrics.MaintainerGauge.WithLabelValues(keyspace, changefeed))) + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 2, 2), noDrainTarget) + currentMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, oldMaintainer == currentMaintainer) + require.Equal(t, uint64(1), currentMaintainer.currentMaintainerEpoch()) + require.Equal(t, float64(1), promtestutil.ToFloat64(metrics.MaintainerGauge.WithLabelValues(keyspace, changefeed))) + + currentMaintainer.checkpointTsGauge.Set(123) + require.Equal(t, float64(123), promtestutil.ToFloat64(metrics.MaintainerCheckpointTsGauge.WithLabelValues(keyspace, changefeed))) +} + +func TestManagerMaintainerSet_AddMaintainerAfterStoppedKeepsReplacement(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) + cfID := common.NewChangeFeedIDWithName("stopped-maintainer-replacement", common.DefaultKeyspaceName) + cleanupMaintainerMetricsForTest(t, cfID) + noDrainTarget := func() (node.ID, uint64) { return "", 0 } + keyspace, changefeed := cfID.Keyspace(), cfID.Name() + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 1, 1), noDrainTarget) + oldMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + + oldMaintainer.markRemoved() + oldMaintainer.scheduleState.Store(int32(heartbeatpb.ComponentState_Stopped)) + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 2, 2), noDrainTarget) + currentMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.False(t, oldMaintainer == currentMaintainer) + require.Equal(t, 
uint64(2), currentMaintainer.currentMaintainerEpoch()) + require.Equal(t, float64(1), promtestutil.ToFloat64(metrics.MaintainerGauge.WithLabelValues(keyspace, changefeed))) + + currentMaintainer.checkpointTsGauge.Set(456) + maintainers.cleanupRemovedMaintainer(cfID, oldMaintainer) + maintainerAfterStaleCleanup, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, currentMaintainer == maintainerAfterStaleCleanup) + require.Equal(t, float64(456), promtestutil.ToFloat64(metrics.MaintainerCheckpointTsGauge.WithLabelValues(keyspace, changefeed))) + + currentMaintainer.markRemoved() + currentMaintainer.scheduleState.Store(int32(heartbeatpb.ComponentState_Stopped)) + maintainers.cleanupRemovedMaintainer(cfID, currentMaintainer) + _, ok = maintainers.getMaintainer(cfID) + require.False(t, ok) +} + +func TestManagerMaintainerSet_AddMaintainerKeepsCompatibilityEpoch(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + noDrainTarget := func() (node.ID, uint64) { return "", 0 } + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 3, 0), noDrainTarget) + compatMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.Zero(t, compatMaintainer.currentMaintainerEpoch()) + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 4, 0), noDrainTarget) + compatMaintainerAfterRetry, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, compatMaintainer == compatMaintainerAfterRetry) +} + +func TestManagerMaintainerSet_AddMaintainerRejectsOlderEpoch(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + noDrainTarget := func() (node.ID, uint64) { return "", 0 } + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 2, 2), noDrainTarget) + currentMaintainer, ok := 
maintainers.getMaintainer(cfID) + require.True(t, ok) + require.Equal(t, uint64(2), currentMaintainer.currentMaintainerEpoch()) + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 1, 1), noDrainTarget) + maintainerAfterOldAdd, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, currentMaintainer == maintainerAfterOldAdd) + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 3, 0), noDrainTarget) + maintainerAfterCompatAdd, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, currentMaintainer == maintainerAfterCompatAdd) +} + +func TestManagerMaintainerSet_AddMaintainerDoesNotCreateRejectedDuplicate(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + noDrainTarget := func() (node.ID, uint64) { return "", 0 } + + maintainers.handleAddMaintainer(newAddMaintainerRequestForEpoch(t, cfID, 2, 2), noDrainTarget) + currentMaintainer, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.Equal(t, uint64(2), currentMaintainer.currentMaintainerEpoch()) + + rejectedEpochs := []uint64{3, 2, 1, 0} + for _, requestEpoch := range rejectedEpochs { + t.Run("requestEpoch"+strconv.FormatUint(requestEpoch, 10), func(t *testing.T) { + require.False(t, maintainers.mayRegisterMaintainerForAdd(cfID, requestEpoch)) + registeredMaintainer := maintainers.registerMaintainerForAdd(cfID, requestEpoch, func() *Maintainer { + t.Fatalf("registerMaintainerForAdd created maintainer for rejected request epoch %d", requestEpoch) + return nil + }) + require.Nil(t, registeredMaintainer) + maintainerAfterRejectedAdd, ok := maintainers.getMaintainer(cfID) + require.True(t, ok) + require.True(t, currentMaintainer == maintainerAfterRejectedAdd) + }) + } +} + +func TestManagerMaintainerSet_RemoveMissingMaintainerReportsRequestEpoch(t *testing.T) { + maintainers := newManagerMaintainerSetForAddTest(t) 
+ cfID := common.NewChangeFeedIDWithName("remove-missing", common.DefaultKeyspaceName) + req := &heartbeatpb.RemoveMaintainerRequest{ + Id: cfID.ToPB(), + MaintainerEpoch: 7, + } + msg := messaging.NewSingleTargetMessage( + node.ID("self"), + messaging.MaintainerManagerTopic, + req, + ) + + status := maintainers.handleRemoveMaintainer(msg) + require.NotNil(t, status) + require.Equal(t, heartbeatpb.ComponentState_Stopped, status.State) + require.Equal(t, uint64(7), status.MaintainerEpoch) +} + // This is a integration test for maintainer manager, it may consume a lot of time. // scale out/in close, add/remove tables func TestMaintainerSchedulesNodeChanges(t *testing.T) { diff --git a/maintainer/maintainer_test.go b/maintainer/maintainer_test.go index 434221c4ce..242c25af2f 100644 --- a/maintainer/maintainer_test.go +++ b/maintainer/maintainer_test.go @@ -138,9 +138,10 @@ func (m *mockDispatcherManager) onBootstrapRequest(msg *messaging.TargetMessage) req := msg.Message[0].(*heartbeatpb.MaintainerBootstrapRequest) m.maintainerID = msg.From response := &heartbeatpb.MaintainerBootstrapResponse{ - ChangefeedID: req.ChangefeedID, - Spans: m.bootstrapTables, - CheckpointTs: req.StartTs, + ChangefeedID: req.ChangefeedID, + Spans: m.bootstrapTables, + CheckpointTs: req.StartTs, + MaintainerEpoch: req.MaintainerEpoch, } m.changefeedID = req.ChangefeedID m.checkpointTs = req.StartTs @@ -171,6 +172,7 @@ func (m *mockDispatcherManager) onPostBootstrapRequest(msg *messaging.TargetMess ChangefeedID: req.ChangefeedID, TableTriggerEventDispatcherId: req.TableTriggerEventDispatcherId, Err: nil, + MaintainerEpoch: req.MaintainerEpoch, } err := m.mc.SendCommand(messaging.NewSingleTargetMessage( m.maintainerID, @@ -240,8 +242,9 @@ func (m *mockDispatcherManager) onDispatchRequest( func (m *mockDispatcherManager) onMaintainerCloseRequest(msg *messaging.TargetMessage) { _ = m.mc.SendCommand(messaging.NewSingleTargetMessage(msg.From, messaging.MaintainerTopic, 
&heartbeatpb.MaintainerCloseResponse{ - ChangefeedID: msg.Message[0].(*heartbeatpb.MaintainerCloseRequest).ChangefeedID, - Success: true, + ChangefeedID: msg.Message[0].(*heartbeatpb.MaintainerCloseRequest).ChangefeedID, + Success: true, + MaintainerEpoch: msg.Message[0].(*heartbeatpb.MaintainerCloseRequest).MaintainerEpoch, })) } @@ -260,6 +263,67 @@ func (m *mockDispatcherManager) sendHeartbeat() { } } +func TestMaintainerPostBootstrapResponseRequiresCurrentEpoch(t *testing.T) { + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + m := &Maintainer{ + changefeedID: cfID, + info: &config.ChangeFeedInfo{Epoch: 2}, + postBootstrapMsg: &heartbeatpb.MaintainerPostBootstrapRequest{ + ChangefeedID: cfID.ToPB(), + MaintainerEpoch: 2, + }, + } + + m.onMaintainerPostBootstrapResponse(messaging.NewSingleTargetMessage( + node.ID("current"), + messaging.MaintainerManagerTopic, + &heartbeatpb.MaintainerPostBootstrapResponse{ + ChangefeedID: cfID.ToPB(), + MaintainerEpoch: 1, + }, + )) + require.NotNil(t, m.postBootstrapMsg) + + m.onMaintainerPostBootstrapResponse(messaging.NewSingleTargetMessage( + node.ID("current"), + messaging.MaintainerManagerTopic, + &heartbeatpb.MaintainerPostBootstrapResponse{ + ChangefeedID: cfID.ToPB(), + MaintainerEpoch: 2, + }, + )) + require.Nil(t, m.postBootstrapMsg) +} + +func TestMaintainerEpochRequestRequiresCompatOrCurrentEpoch(t *testing.T) { + compatMaintainer := &Maintainer{info: &config.ChangeFeedInfo{}} + require.True(t, compatMaintainer.isMaintainerEpochRequestAllowed(0)) + require.True(t, compatMaintainer.isMaintainerEpochRequestAllowed(2)) + + strictMaintainer := &Maintainer{info: &config.ChangeFeedInfo{Epoch: 2}} + require.False(t, strictMaintainer.isMaintainerEpochRequestAllowed(0)) + require.False(t, strictMaintainer.isMaintainerEpochRequestAllowed(1)) + require.True(t, strictMaintainer.isMaintainerEpochRequestAllowed(2)) +} + +func TestMaintainerCloseResponseIgnoredBeforeRemoving(t *testing.T) { + cfID := 
common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + m := &Maintainer{ + changefeedID: cfID, + info: &config.ChangeFeedInfo{Epoch: 2}, + closedNodes: make(map[node.ID]struct{}), + } + + m.onMaintainerCloseResponse(node.ID("old"), &heartbeatpb.MaintainerCloseResponse{ + ChangefeedID: cfID.ToPB(), + Success: true, + MaintainerEpoch: 0, + }) + + require.Empty(t, m.closedNodes) + require.False(t, m.removing.Load()) +} + func TestMaintainerSchedule(t *testing.T) { // This test exercises a single-node maintainer lifecycle: // 1) Bootstrap a changefeed via the dispatcher manager mock. @@ -383,6 +447,7 @@ func TestMaintainer_GetMaintainerStatusUsesCommittedCheckpoint(t *testing.T) { m := &Maintainer{ changefeedID: cfID, + info: &config.ChangeFeedInfo{Epoch: 3}, controller: &Controller{ spanController: spanController, }, @@ -398,6 +463,7 @@ func TestMaintainer_GetMaintainerStatusUsesCommittedCheckpoint(t *testing.T) { status := m.GetMaintainerStatus() require.Equal(t, uint64(20), status.CheckpointTs) require.Equal(t, uint64(50), status.LastSyncedTs) + require.Equal(t, uint64(3), status.MaintainerEpoch) } func TestMaintainerHeartbeatDuringRemovingSkipsFailoverRecovery(t *testing.T) { @@ -421,7 +487,7 @@ func TestMaintainerHeartbeatDuringRemovingSkipsFailoverRecovery(t *testing.T) { }, captureID, false) refresher := replica.NewRegionCountRefresher(cfID, time.Minute) controller := NewController(cfID, 10, &mockThreadPool{}, - config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize) + config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false, testBalanceMoveBatchSize, 1) totalSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, 1) dispatcherID := common.NewDispatcherID() @@ -611,6 +677,7 @@ func TestMaintainerCalculateNewCheckpointTs(t *testing.T) { replicaSet, selfNodeID, heartbeatpb.OperatorType_O_Add, + m.controller.currentMaintainerEpoch(), 
))) m.removing.Store(true) diff --git a/maintainer/operator/operator_add.go b/maintainer/operator/operator_add.go index d9abe841f4..ef36cee6d5 100644 --- a/maintainer/operator/operator_add.go +++ b/maintainer/operator/operator_add.go @@ -57,7 +57,8 @@ type AddDispatcherOperator struct { // (for example merge) don't go through ScheduleDispatcherRequest and therefore don't need operatorType here. operatorType heartbeatpb.OperatorType - sendThrottler sendThrottler + maintainerEpoch uint64 + sendThrottler sendThrottler } func NewAddDispatcherOperator( @@ -65,13 +66,15 @@ func NewAddDispatcherOperator( replicaSet *replica.SpanReplication, dest node.ID, operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, ) *AddDispatcherOperator { return &AddDispatcherOperator{ - replicaSet: replicaSet, - dest: dest, - spanController: spanController, - operatorType: operatorType, - sendThrottler: newSendThrottler(), + replicaSet: replicaSet, + dest: dest, + spanController: spanController, + operatorType: operatorType, + maintainerEpoch: maintainerEpoch, + sendThrottler: newSendThrottler(), } } @@ -107,7 +110,7 @@ func (m *AddDispatcherOperator) Schedule() *messaging.TargetMessage { if !m.sendThrottler.shouldSend() { return nil } - return m.replicaSet.NewAddDispatcherMessage(m.dest, m.operatorType) + return m.replicaSet.NewAddDispatcherMessage(m.dest, m.operatorType, m.maintainerEpoch) } // OnNodeRemove is called when node offline, and the replicaset must already move to absent status and will be scheduled again diff --git a/maintainer/operator/operator_add_test.go b/maintainer/operator/operator_add_test.go index 6e8cc8ef7b..21aa53fcf5 100644 --- a/maintainer/operator/operator_add_test.go +++ b/maintainer/operator/operator_add_test.go @@ -56,7 +56,7 @@ func TestAddOperator_DestNodeRemoved(t *testing.T) { absentReplicaSet := newAddTestReplicaSet(spanController, changefeedID) - op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, 
heartbeatpb.OperatorType_O_Add) + op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add, 7) require.NotNil(t, op) op.Start() @@ -66,6 +66,8 @@ func TestAddOperator_DestNodeRemoved(t *testing.T) { msg := op.Schedule() require.NotNil(t, msg) require.Equal(t, nodeB.String(), msg.To.String()) + req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) + require.Equal(t, uint64(7), req.MaintainerEpoch) // Node B is removed before it reports working status op.OnNodeRemove(nodeB) @@ -87,7 +89,7 @@ func TestAddOperator_DestReportsWorking(t *testing.T) { spanController, changefeedID, _, _, nodeB := setupTestEnvironment(t) absentReplicaSet := newAddTestReplicaSet(spanController, changefeedID) - op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add) + op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add, 7) require.NotNil(t, op) op.Start() @@ -119,7 +121,7 @@ func TestAddOperator_DestReportsRemoved(t *testing.T) { spanController, changefeedID, _, _, nodeB := setupTestEnvironment(t) absentReplicaSet := newAddTestReplicaSet(spanController, changefeedID) - op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add) + op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add, 7) require.NotNil(t, op) op.Start() @@ -149,7 +151,7 @@ func TestAddOperator_StoppedStatusIgnored(t *testing.T) { spanController, changefeedID, _, _, nodeB := setupTestEnvironment(t) absentReplicaSet := newAddTestReplicaSet(spanController, changefeedID) - op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add) + op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add, 7) require.NotNil(t, op) op.Start() @@ -176,7 +178,7 @@ func 
TestAddOperator_TaskRemovedDoesNotReintroduceSpan(t *testing.T) { spanController, changefeedID, _, _, nodeB := setupTestEnvironment(t) absentReplicaSet := newAddTestReplicaSet(spanController, changefeedID) - op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add) + op := NewAddDispatcherOperator(spanController, absentReplicaSet, nodeB, heartbeatpb.OperatorType_O_Add, 7) require.NotNil(t, op) op.Start() diff --git a/maintainer/operator/operator_controller.go b/maintainer/operator/operator_controller.go index f876b41dd6..0158d15f67 100644 --- a/maintainer/operator/operator_controller.go +++ b/maintainer/operator/operator_controller.go @@ -16,13 +16,13 @@ package operator import ( "container/heap" "sync" + "sync/atomic" "time" "github.com/pingcap/log" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/maintainer/replica" "github.com/pingcap/ticdc/maintainer/span" - "github.com/pingcap/ticdc/maintainer/split" "github.com/pingcap/ticdc/pkg/common" appcontext "github.com/pingcap/ticdc/pkg/common/context" "github.com/pingcap/ticdc/pkg/messaging" @@ -45,13 +45,13 @@ var _ operator.Controller[common.DispatcherID, *heartbeatpb.TableSpanStatus] = & // Controller is the operator controller, it manages all operators. // And the Controller is responsible for the execution of the operator. type Controller struct { - role string - changefeedID common.ChangeFeedID - batchSize int - messageCenter messaging.MessageCenter - spanController *span.Controller - nodeManager *watcher.NodeManager - splitter *split.Splitter + role string + changefeedID common.ChangeFeedID + batchSize int + messageCenter messaging.MessageCenter + spanController *span.Controller + nodeManager *watcher.NodeManager + maintainerEpoch atomic.Uint64 // admissionMu serializes removing-mode quiesce with normal operator side effects. 
// A normal operator must hold the read side from its final allow check through @@ -136,6 +136,16 @@ func (oc *Controller) isQuiescing() bool { return oc.quiescing } +// SetMaintainerEpoch sets the epoch used by scheduler requests. +func (oc *Controller) SetMaintainerEpoch(maintainerEpoch uint64) { + oc.maintainerEpoch.Store(maintainerEpoch) +} + +// MaintainerEpoch returns the epoch used by maintainer-to-dispatcher-manager requests. +func (oc *Controller) MaintainerEpoch() uint64 { + return oc.maintainerEpoch.Load() +} + // Execute poll the operator from the queue and execute it // It will be called in the thread pool. func (oc *Controller) Execute() time.Time { @@ -183,7 +193,12 @@ func (oc *Controller) scheduleOperator(op operator.Operator[common.DispatcherID, func (oc *Controller) RemoveTasksBySchemaID(schemaID int64) { tasks := oc.spanController.GetRemoveTasksBySchemaID(schemaID) for _, task := range tasks { - oc.removeReplicaSet(newRemoveDispatcherOperator(oc.spanController, task, heartbeatpb.OperatorType_O_Remove)) + oc.removeReplicaSet(newRemoveDispatcherOperator( + oc.spanController, + task, + heartbeatpb.OperatorType_O_Remove, + oc.MaintainerEpoch(), + )) } oc.spanController.RemoveBySchemaID(schemaID) } @@ -201,7 +216,12 @@ func (oc *Controller) RemoveTasksBySchemaID(schemaID int64) { func (oc *Controller) RemoveTasksByTableIDs(tables ...int64) { tasks := oc.spanController.GetRemoveTasksByTableIDs(tables...) for _, task := range tasks { - oc.removeReplicaSet(newRemoveDispatcherOperator(oc.spanController, task, heartbeatpb.OperatorType_O_Remove)) + oc.removeReplicaSet(newRemoveDispatcherOperator( + oc.spanController, + task, + heartbeatpb.OperatorType_O_Remove, + oc.MaintainerEpoch(), + )) } oc.spanController.RemoveByTableIDs(tables...) 
} @@ -524,7 +544,7 @@ func (oc *Controller) checkAffectedNodes(op operator.Operator[common.DispatcherI } func (oc *Controller) NewMoveOperator(replicaSet *replica.SpanReplication, origin, dest node.ID) operator.Operator[common.DispatcherID, *heartbeatpb.TableSpanStatus] { - return NewMoveDispatcherOperator(oc.spanController, replicaSet, origin, dest) + return NewMoveDispatcherOperator(oc.spanController, replicaSet, origin, dest, oc.MaintainerEpoch()) } func checkMergeOperator(affectedReplicaSets []*replica.SpanReplication) bool { @@ -585,7 +605,7 @@ func (oc *Controller) AddMergeOperator( } } - mergeOperator := NewMergeDispatcherOperator(oc.spanController, affectedReplicaSets, operators) + mergeOperator := NewMergeDispatcherOperator(oc.spanController, affectedReplicaSets, operators, oc.MaintainerEpoch()) ret := oc.AddOperator(mergeOperator) if !ret { log.Error("failed to add merge dispatcher operator", diff --git a/maintainer/operator/operator_controller_test.go b/maintainer/operator/operator_controller_test.go index 576a6bc532..32f9a77f3f 100644 --- a/maintainer/operator/operator_controller_test.go +++ b/maintainer/operator/operator_controller_test.go @@ -62,8 +62,8 @@ func TestController_CountInflightDrainMovesFromNode(t *testing.T) { }) oc := NewOperatorController(changefeedID, spanController, 1, common.DefaultMode) - require.True(t, oc.AddOperator(NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB))) - require.True(t, oc.AddOperator(NewMoveDispatcherOperator(spanController, otherReplicaSet, nodeB, nodeA))) + require.True(t, oc.AddOperator(NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7))) + require.True(t, oc.AddOperator(NewMoveDispatcherOperator(spanController, otherReplicaSet, nodeB, nodeA, 7))) require.Equal(t, 1, oc.CountInflightDrainMovesFromNode(nodeA)) require.Equal(t, 1, oc.CountInflightDrainMovesFromNode(nodeB)) @@ -280,7 +280,7 @@ func TestController_PostFinishCalledOnceOnReplace(t *testing.T) { }() 
<-op.isFinishedCalled - oc.removeReplicaSet(newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove)) + oc.removeReplicaSet(newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7)) wg.Wait() require.Equal(t, int32(1), op.postFinishCount.Load()) @@ -327,6 +327,16 @@ func TestController_AddMergeOperatorFailureCleansOccupyOperators(t *testing.T) { require.Equal(t, 1, oc.OperatorSize()) } +func TestMergeDispatcherOperatorScheduleMaintainerEpoch(t *testing.T) { + spanController, toMergedReplicaSets, occupyOperators, _ := setupMergeTestEnvironment(t) + + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) + msg := op.Schedule() + + req := msg.Message[0].(*heartbeatpb.MergeDispatcherRequest) + require.Equal(t, uint64(7), req.MaintainerEpoch) +} + func TestController_RemoveReplicaSet_ReplacesRemoveOperatorOnTaskRemoved(t *testing.T) { // Scenario: the barrier can enqueue the same remove task multiple times during failover/bootstrap. // Steps: @@ -346,10 +356,11 @@ func TestController_RemoveReplicaSet_ReplacesRemoveOperatorOnTaskRemoved(t *test spanController, replicaSet, heartbeatpb.OperatorType_O_Move, + 7, func() { postFinishCount.Add(1) }, ))) - oc.removeReplicaSet(newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove)) + oc.removeReplicaSet(newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7)) require.Equal(t, int32(0), postFinishCount.Load()) require.NotNil(t, oc.GetOperator(replicaSet.ID)) diff --git a/maintainer/operator/operator_merge.go b/maintainer/operator/operator_merge.go index fb7b281de3..ea4797f6c6 100644 --- a/maintainer/operator/operator_merge.go +++ b/maintainer/operator/operator_merge.go @@ -41,10 +41,11 @@ import ( // - OnNodeRemove(originNode): abort merge, mark old replicas absent, and remove the merged replica. 
// - OnTaskRemoved(): abort merge due to DDL and clean up without clearing node binding of old replicas. type MergeDispatcherOperator struct { - spanController *span.Controller - originNode node.ID - id common.DispatcherID - dispatcherIDs []*heartbeatpb.DispatcherID + spanController *span.Controller + originNode node.ID + id common.DispatcherID + dispatcherIDs []*heartbeatpb.DispatcherID + maintainerEpoch uint64 // aborted indicates the merge should not be applied successfully. It can be set by OnNodeRemove // or OnTaskRemoved. When aborted is true, PostFinish follows the abort path. @@ -101,6 +102,7 @@ func NewMergeDispatcherOperator( spanController *span.Controller, toMergedReplicaSets []*replica.SpanReplication, occupyOperators []operator.Operator[common.DispatcherID, *heartbeatpb.TableSpanStatus], + maintainerEpoch uint64, ) *MergeDispatcherOperator { toMergedSpans := make([]*heartbeatpb.TableSpan, 0, len(toMergedReplicaSets)) for _, replicaSet := range toMergedReplicaSets { @@ -140,6 +142,7 @@ func NewMergeDispatcherOperator( originNode: nodeID, id: newDispatcherID, dispatcherIDs: dispatcherIDs, + maintainerEpoch: maintainerEpoch, toMergedReplicaSets: toMergedReplicaSets, checkpointTs: 0, mergedSpanInfo: spansInfo, @@ -211,6 +214,7 @@ func (m *MergeDispatcherOperator) Schedule() *messaging.TargetMessage { DispatcherIDs: m.dispatcherIDs, MergedDispatcherID: m.id.ToPB(), Mode: m.newReplicaSet.GetMode(), + MaintainerEpoch: m.maintainerEpoch, }) } diff --git a/maintainer/operator/operator_merge_test.go b/maintainer/operator/operator_merge_test.go index 93769c43a0..3bd415031a 100644 --- a/maintainer/operator/operator_merge_test.go +++ b/maintainer/operator/operator_merge_test.go @@ -147,7 +147,7 @@ func setupLargeMergeTestEnvironment( func TestMergeOperator_NodeRemovedBeforeWorking(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, nodeA := setupMergeTestEnvironment(t) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, 
occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() @@ -188,7 +188,7 @@ func TestMergeOperator_NodeRemovedBeforeWorking(t *testing.T) { func TestMergeOperator_TaskRemovedByDDLBeforeWorking(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, nodeA := setupMergeTestEnvironment(t) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() @@ -218,7 +218,7 @@ func TestMergeOperator_TaskRemovedByDDLBeforeWorking(t *testing.T) { func TestMergeOperator_NewReplicaSetCheckpointTsUsesMinOfMergedReplicas(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, _ := setupMergeTestEnvironmentWithCheckpointTs(t, 1500, 1000) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) // The merged replica should inherit a safe checkpointTs to avoid regressing global checkpoint. 
require.Equal(t, uint64(1000), op.newReplicaSet.GetStatus().GetCheckpointTs()) @@ -231,7 +231,7 @@ func TestMergeOperator_NewReplicaSetCheckpointTsUsesMinOfMergedReplicas(t *testi func TestMergeOperator_SuccessfulMerge(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, nodeA := setupMergeTestEnvironment(t) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() @@ -267,7 +267,7 @@ func TestMergeOperator_PostFinishReleasesOccupyAfterRemovingOldReplicas(t *testi spanController, toMergedReplicaSets, occupyOperators, nodeA := setupLargeMergeTestEnvironment(t, 4096) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() @@ -311,7 +311,7 @@ func TestMergeOperator_PostFinishReleasesOccupyAfterRemovingOldReplicas(t *testi func TestMergeOperator_NodeRemovedAfterWorking(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, nodeA := setupMergeTestEnvironment(t) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() @@ -348,7 +348,7 @@ func TestMergeOperator_NodeRemovedAfterWorking(t *testing.T) { func TestMergeOperator_TaskRemovedByDDLAfterWorking(t *testing.T) { spanController, toMergedReplicaSets, occupyOperators, nodeA := setupMergeTestEnvironment(t) - op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators) + op := NewMergeDispatcherOperator(spanController, toMergedReplicaSets, occupyOperators, 7) require.NotNil(t, op) op.Start() diff --git a/maintainer/operator/operator_move.go b/maintainer/operator/operator_move.go index 
d45dec35bf..aa10836442 100644 --- a/maintainer/operator/operator_move.go +++ b/maintainer/operator/operator_move.go @@ -52,10 +52,11 @@ const ( // MoveDispatcherOperator is an operator to move a table span to the destination dispatcher type MoveDispatcherOperator struct { - replicaSet *replica.SpanReplication - spanController *span.Controller - origin node.ID - dest node.ID + replicaSet *replica.SpanReplication + spanController *span.Controller + origin node.ID + dest node.ID + maintainerEpoch uint64 // State transitions: // removeOrigin --(origin stopped)-> addDest --(dest working)-> doneSuccess @@ -97,13 +98,20 @@ func (m *MoveDispatcherOperator) finishAsAbsent() { m.state = moveStateDoneNoPostFinish } -func NewMoveDispatcherOperator(spanController *span.Controller, replicaSet *replica.SpanReplication, origin, dest node.ID) *MoveDispatcherOperator { +func NewMoveDispatcherOperator( + spanController *span.Controller, + replicaSet *replica.SpanReplication, + origin node.ID, + dest node.ID, + maintainerEpoch uint64, +) *MoveDispatcherOperator { return &MoveDispatcherOperator{ - replicaSet: replicaSet, - origin: origin, - dest: dest, - spanController: spanController, - sendThrottler: newSendThrottler(), + replicaSet: replicaSet, + origin: origin, + dest: dest, + spanController: spanController, + maintainerEpoch: maintainerEpoch, + sendThrottler: newSendThrottler(), } } @@ -152,9 +160,9 @@ func (m *MoveDispatcherOperator) Schedule() *messaging.TargetMessage { switch m.state { case moveStateAddDest: - return m.replicaSet.NewAddDispatcherMessage(m.dest, heartbeatpb.OperatorType_O_Move) + return m.replicaSet.NewAddDispatcherMessage(m.dest, heartbeatpb.OperatorType_O_Move, m.maintainerEpoch) case moveStateRemoveOrigin, moveStateAbortRemoveOrigin: - return m.replicaSet.NewRemoveDispatcherMessage(m.origin, heartbeatpb.OperatorType_O_Move) + return m.replicaSet.NewRemoveDispatcherMessage(m.origin, heartbeatpb.OperatorType_O_Move, m.maintainerEpoch) default: return nil } diff 
--git a/maintainer/operator/operator_move_test.go b/maintainer/operator/operator_move_test.go index 60558a9a07..a8c446e9d6 100644 --- a/maintainer/operator/operator_move_test.go +++ b/maintainer/operator/operator_move_test.go @@ -88,7 +88,7 @@ func setupTestEnvironment(t *testing.T) (*span.Controller, common.ChangeFeedID, func TestMoveOperator_DestNodeRemovedBeforeOriginStopped(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() @@ -109,6 +109,7 @@ func TestMoveOperator_DestNodeRemovedBeforeOriginStopped(t *testing.T) { require.True(t, ok) require.Equal(t, heartbeatpb.ScheduleAction_Remove, scheduleMsg.ScheduleAction) require.Equal(t, replicaSet.ID.ToPB(), scheduleMsg.Config.DispatcherID) + require.Equal(t, uint64(7), scheduleMsg.MaintainerEpoch) absentSizeBefore := spanController.GetAbsentSize() nonWorkingStatus := &heartbeatpb.TableSpanStatus{ @@ -131,7 +132,7 @@ func TestMoveOperator_DestNodeRemovedBeforeOriginStopped(t *testing.T) { func TestMoveOperator_DestNodeRemovedAfterOriginStopped(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() @@ -161,7 +162,7 @@ func TestMoveOperator_DestNodeRemovedAfterOriginStopped(t *testing.T) { func TestMoveOperator_OriginNodeRemovedBeforeOriginStopped(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() @@ -204,7 +205,7 @@ func 
TestMoveOperator_OriginNodeRemovedBeforeOriginStopped(t *testing.T) { func TestMoveOperator_OriginNodeRemovedAfterOriginStopped(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() @@ -254,7 +255,7 @@ func TestMoveOperator_BothNodesRemovedBeforeStartDoesNotLeaveSchedulingWithoutNo setAliveNodes(nodeManager, map[node.ID]*node.Info{}) oc := NewOperatorController(changefeedID, spanController, 1, common.DefaultMode) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.True(t, oc.AddOperator(op)) require.Equal(t, 1, spanController.GetAbsentSize()) @@ -270,7 +271,7 @@ func TestMoveOperator_BothNodesRemovedBeforeStartDoesNotLeaveSchedulingWithoutNo func TestMoveOperator_DestThenOriginRemovedAbortsToAbsent(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() @@ -298,7 +299,7 @@ func TestMoveOperator_TaskRemovedByDDL(t *testing.T) { spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) spanController.AddReplicatingSpan(replicaSet) - op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB, 7) require.NotNil(t, op) op.Start() diff --git a/maintainer/operator/operator_remove.go b/maintainer/operator/operator_remove.go index 43a95c21c6..5c44a0e619 100644 --- a/maintainer/operator/operator_remove.go +++ b/maintainer/operator/operator_remove.go @@ -59,33 +59,35 @@ type removeDispatcherOperator struct { // 
ScheduleDispatcherRequest (for example merge-related messages). operatorType heartbeatpb.OperatorType - sendThrottler sendThrottler + maintainerEpoch uint64 + sendThrottler sendThrottler } func NewRemoveDispatcherOperator( spanController *span.Controller, replicaSet *replica.SpanReplication, operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, postFinish func(), ) *removeDispatcherOperator { return &removeDispatcherOperator{ - replicaSet: replicaSet, - nodeID: replicaSet.GetNodeID(), - spanController: spanController, - postFinish: postFinish, - operatorType: operatorType, - sendThrottler: newSendThrottler(), + replicaSet: replicaSet, + nodeID: replicaSet.GetNodeID(), + spanController: spanController, + postFinish: postFinish, + operatorType: operatorType, + maintainerEpoch: maintainerEpoch, + sendThrottler: newSendThrottler(), } } -func newRemoveDispatcherOperator(spanController *span.Controller, replicaSet *replica.SpanReplication, operatorType heartbeatpb.OperatorType) *removeDispatcherOperator { - return &removeDispatcherOperator{ - replicaSet: replicaSet, - nodeID: replicaSet.GetNodeID(), - spanController: spanController, - operatorType: operatorType, - sendThrottler: newSendThrottler(), - } +func newRemoveDispatcherOperator( + spanController *span.Controller, + replicaSet *replica.SpanReplication, + operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, +) *removeDispatcherOperator { + return NewRemoveDispatcherOperator(spanController, replicaSet, operatorType, maintainerEpoch, nil) } func (m *removeDispatcherOperator) Check(from node.ID, status *heartbeatpb.TableSpanStatus) { @@ -110,7 +112,7 @@ func (m *removeDispatcherOperator) Schedule() *messaging.TargetMessage { return nil } - return m.replicaSet.NewRemoveDispatcherMessage(m.nodeID, m.operatorType) + return m.replicaSet.NewRemoveDispatcherMessage(m.nodeID, m.operatorType, m.maintainerEpoch) } // OnNodeRemove is called when node offline, and the replicaset has been removed from 
spanController, so it's ok. diff --git a/maintainer/operator/operator_remove_test.go b/maintainer/operator/operator_remove_test.go index 8f681016d5..2857082d37 100644 --- a/maintainer/operator/operator_remove_test.go +++ b/maintainer/operator/operator_remove_test.go @@ -28,7 +28,7 @@ import ( func TestRemoveOperator_NodeRemovedBeforeStopped(t *testing.T) { spanController, _, replicaSet, nodeA, _ := setupTestEnvironment(t) - op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove) + op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7) require.NotNil(t, op) op.Start() @@ -39,6 +39,8 @@ func TestRemoveOperator_NodeRemovedBeforeStopped(t *testing.T) { require.NotNil(t, msg) require.Equal(t, messaging.TypeScheduleDispatcherRequest, msg.Type) require.Equal(t, nodeA.String(), msg.To.String()) + req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) + require.Equal(t, uint64(7), req.MaintainerEpoch) // Node A is removed before it reports stopped status op.OnNodeRemove(nodeA) @@ -53,7 +55,7 @@ func TestRemoveOperator_SnapshotNodeIDAfterMarkAbsent(t *testing.T) { spanController, _, replicaSet, nodeA, _ := setupTestEnvironment(t) spanController.AddReplicatingSpan(replicaSet) - op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove) + op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7) require.NotNil(t, op) spanController.MarkSpanAbsent(replicaSet) @@ -75,7 +77,7 @@ func TestRemoveOperator_SnapshotNodeIDAfterMarkAbsent(t *testing.T) { func TestRemoveOperator_NotFinishedOnWaitingMerge(t *testing.T) { spanController, _, replicaSet, nodeA, _ := setupTestEnvironment(t) - op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove) + op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7) require.NotNil(t, op) 
waitingMergeStatus := &heartbeatpb.TableSpanStatus{ @@ -98,7 +100,7 @@ func TestRemoveOperator_NotFinishedOnWaitingMerge(t *testing.T) { func TestRemoveOperator_FinishedOnRemovedStatus(t *testing.T) { spanController, _, replicaSet, nodeA, _ := setupTestEnvironment(t) - op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove) + op := newRemoveDispatcherOperator(spanController, replicaSet, heartbeatpb.OperatorType_O_Remove, 7) require.NotNil(t, op) removedStatus := &heartbeatpb.TableSpanStatus{ diff --git a/maintainer/operator/operator_split.go b/maintainer/operator/operator_split.go index 636c7c2454..36fcef912b 100644 --- a/maintainer/operator/operator_split.go +++ b/maintainer/operator/operator_split.go @@ -52,6 +52,7 @@ type SplitDispatcherOperator struct { splitSpans []*heartbeatpb.TableSpan splitSpanInfo string splitTargetNodes []node.ID + maintainerEpoch uint64 // postFinish is invoked for each newly created span after ReplaceReplicaSet. It is mainly used by // the split-balance scheduler to schedule the new spans to specific nodes. // Note: when postFinish is not nil, splitTargetNodes is expected to be aligned with splitSpans. 
@@ -74,6 +75,7 @@ func NewSplitDispatcherOperator( replicaSet *replica.SpanReplication, splitSpans []*heartbeatpb.TableSpan, splitTargetNodes []node.ID, + maintainerEpoch uint64, postFinish func(span *replica.SpanReplication, node node.ID) bool, ) *SplitDispatcherOperator { var spansInfo strings.Builder @@ -89,6 +91,7 @@ func NewSplitDispatcherOperator( spanController: spanController, splitSpanInfo: spansInfo.String(), splitTargetNodes: splitTargetNodes, + maintainerEpoch: maintainerEpoch, postFinish: postFinish, sendThrottler: newSendThrottler(), } @@ -147,7 +150,7 @@ func (m *SplitDispatcherOperator) Schedule() *messaging.TargetMessage { if !m.sendThrottler.shouldSend() { return nil } - return m.replicaSet.NewRemoveDispatcherMessage(m.originNode, heartbeatpb.OperatorType_O_Split) + return m.replicaSet.NewRemoveDispatcherMessage(m.originNode, heartbeatpb.OperatorType_O_Split, m.maintainerEpoch) } // OnTaskRemoved is called when the task is removed by ddl diff --git a/maintainer/operator/operator_split_test.go b/maintainer/operator/operator_split_test.go index e0f3971faa..30ab9797e5 100644 --- a/maintainer/operator/operator_split_test.go +++ b/maintainer/operator/operator_split_test.go @@ -49,7 +49,7 @@ func TestSplitOperator_OriginNodeRemovedBeforeStopped(t *testing.T) { // Split targets are empty, meaning let scheduler decide splitTargetNodes := []node.ID{"", ""} - op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, nil) + op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, 7, nil) require.NotNil(t, op) op.Start() @@ -100,7 +100,7 @@ func TestSplitOperator_OriginNodeRemovedAfterStopped(t *testing.T) { // Split targets are empty, meaning let scheduler decide splitTargetNodes := []node.ID{"", ""} - op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, nil) + op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, 7, 
nil) require.NotNil(t, op) op.Start() @@ -153,7 +153,7 @@ func TestSplitOperator_SuccessfulSplitCreatesAbsentSpans(t *testing.T) { }, } - op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, []node.ID{}, nil) + op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, []node.ID{}, 7, nil) require.NotNil(t, op) op.Start() @@ -244,7 +244,7 @@ func TestSplitOperator_SuccessfulSplitToSchedulingTargets(t *testing.T) { } splitTargetNodes := []node.ID{nodeA, nodeB} - op := NewSplitDispatcherOperator(spanController, replicaSetToSplit, splitSpans, splitTargetNodes, nil) + op := NewSplitDispatcherOperator(spanController, replicaSetToSplit, splitSpans, splitTargetNodes, 7, nil) require.NotNil(t, op) op.Start() @@ -337,7 +337,7 @@ func TestSplitOperator_PostFinishCallbackFailureMarksSpanAbsent(t *testing.T) { } splitTargetNodes := []node.ID{nodeA, nodeB} - op := NewSplitDispatcherOperator(spanController, replicaSetToSplit, splitSpans, splitTargetNodes, + op := NewSplitDispatcherOperator(spanController, replicaSetToSplit, splitSpans, splitTargetNodes, 7, func(_ *replica.SpanReplication, target node.ID) bool { return target != nodeB }) @@ -394,7 +394,7 @@ func TestSplitOperator_PostFinishSkipsWhenTargetNodesMismatch(t *testing.T) { splitTargetNodes := []node.ID{nodeA} postFinishCalled := 0 - op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, + op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, splitTargetNodes, 7, func(_ *replica.SpanReplication, _ node.ID) bool { postFinishCalled++ return true @@ -445,7 +445,7 @@ func TestSplitOperator_TaskRemovedByDDLDoesNotSplit(t *testing.T) { }, } - op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, []node.ID{}, nil) + op := NewSplitDispatcherOperator(spanController, replicaSet, splitSpans, []node.ID{}, 7, nil) require.NotNil(t, op) op.Start() diff --git a/maintainer/replica/replication_span.go 
b/maintainer/replica/replication_span.go index e35aa9ac40..c94a8bd8d7 100644 --- a/maintainer/replica/replication_span.go +++ b/maintainer/replica/replication_span.go @@ -260,7 +260,11 @@ func (r *SpanReplication) getCommittedCheckpointTs() uint64 { // moved/recreated during an in-flight barrier (DDL or syncpoint), starting from the raw checkpoint can // violate barrier semantics (e.g. replaying events that have already been acknowledged by the barrier), // so we adjust StartTs and (for DDL) optionally skip DML at StartTs+1. -func (r *SpanReplication) NewAddDispatcherMessage(server node.ID, operatorType heartbeatpb.OperatorType) *messaging.TargetMessage { +func (r *SpanReplication) NewAddDispatcherMessage( + server node.ID, + operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, +) *messaging.TargetMessage { startTs := r.status.Load().CheckpointTs skipDMLAsStartTs := false ddlBarrierBlockTs := uint64(0) @@ -330,22 +334,35 @@ func (r *SpanReplication) NewAddDispatcherMessage(server node.ID, operatorType h SkipDMLAsStartTs: skipDMLAsStartTs, Mode: r.GetMode(), }, - ScheduleAction: heartbeatpb.ScheduleAction_Create, - OperatorType: operatorType, + ScheduleAction: heartbeatpb.ScheduleAction_Create, + OperatorType: operatorType, + MaintainerEpoch: maintainerEpoch, }) } // NewRemoveDispatcherMessage creates a ScheduleDispatcherRequest(Remove) for this span. // Span and OperatorType are included so a new maintainer can reconstruct intent during bootstrap/failover, // even if the dispatcher has already disappeared from the node span snapshot. 
-func (r *SpanReplication) NewRemoveDispatcherMessage(server node.ID, operatorType heartbeatpb.OperatorType) *messaging.TargetMessage { - return NewRemoveDispatcherMessage(server, r.ChangefeedID, r.ID.ToPB(), r.Span, r.GetMode(), operatorType) +func (r *SpanReplication) NewRemoveDispatcherMessage( + server node.ID, + operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, +) *messaging.TargetMessage { + return NewRemoveDispatcherMessage(server, r.ChangefeedID, r.ID.ToPB(), r.Span, r.GetMode(), operatorType, maintainerEpoch) } // NewRemoveDispatcherMessage creates a ScheduleDispatcherRequest(Remove) for a dispatcherID. // The span is optional for the dispatcher manager, but is useful for maintainer bootstrap to correlate // in-flight remove requests with table spans when the dispatcher no longer exists. -func NewRemoveDispatcherMessage(server node.ID, cfID common.ChangeFeedID, dispatcherID *heartbeatpb.DispatcherID, span *heartbeatpb.TableSpan, mode int64, operatorType heartbeatpb.OperatorType) *messaging.TargetMessage { +func NewRemoveDispatcherMessage( + server node.ID, + cfID common.ChangeFeedID, + dispatcherID *heartbeatpb.DispatcherID, + span *heartbeatpb.TableSpan, + mode int64, + operatorType heartbeatpb.OperatorType, + maintainerEpoch uint64, +) *messaging.TargetMessage { return messaging.NewSingleTargetMessage(server, messaging.HeartbeatCollectorTopic, &heartbeatpb.ScheduleDispatcherRequest{ @@ -355,7 +372,8 @@ func NewRemoveDispatcherMessage(server node.ID, cfID common.ChangeFeedID, dispat Span: span, Mode: mode, }, - ScheduleAction: heartbeatpb.ScheduleAction_Remove, - OperatorType: operatorType, + ScheduleAction: heartbeatpb.ScheduleAction_Remove, + OperatorType: operatorType, + MaintainerEpoch: maintainerEpoch, }) } diff --git a/maintainer/replica/replication_span_test.go b/maintainer/replica/replication_span_test.go index b7802485a2..151bde78aa 100644 --- a/maintainer/replica/replication_span_test.go +++ 
b/maintainer/replica/replication_span_test.go @@ -36,11 +36,12 @@ func TestNewRemoveDispatcherMessage(t *testing.T) { t.Parallel() replicaSet := NewSpanReplication(common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName), common.NewDispatcherID(), 1, getTableSpanByID(4), 10, common.DefaultMode, false) - msg := replicaSet.NewRemoveDispatcherMessage("node1", heartbeatpb.OperatorType_O_Remove) + msg := replicaSet.NewRemoveDispatcherMessage("node1", heartbeatpb.OperatorType_O_Remove, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, heartbeatpb.ScheduleAction_Remove, req.ScheduleAction) require.Equal(t, replicaSet.ID.ToPB(), req.Config.DispatcherID) require.Equal(t, replicaSet.Span, req.Config.Span) + require.Equal(t, uint64(7), req.MaintainerEpoch) require.Equal(t, "node1", msg.To.String()) } @@ -49,7 +50,7 @@ func TestSpanReplication_NewAddDispatcherMessage(t *testing.T) { replicaSet := NewSpanReplication(common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName), common.NewDispatcherID(), 1, getTableSpanByID(4), 10, common.DefaultMode, false) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) require.Equal(t, "node1", msg.To.String()) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, heartbeatpb.ScheduleAction_Create, req.ScheduleAction) @@ -57,6 +58,7 @@ func TestSpanReplication_NewAddDispatcherMessage(t *testing.T) { require.Equal(t, replicaSet.schemaID, req.Config.SchemaID) require.Equal(t, uint64(10), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) + require.Equal(t, uint64(7), req.MaintainerEpoch) } func TestSpanReplication_NewAddDispatcherMessage_ClampToCommittedCheckpoint(t *testing.T) { @@ -65,7 +67,7 @@ func TestSpanReplication_NewAddDispatcherMessage_ClampToCommittedCheckpoint(t *t replicaSet := 
NewSpanReplication(common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName), common.NewDispatcherID(), 1, getTableSpanByID(4), 10, common.DefaultMode, false) replicaSet.BindCommittedCheckpointTs(atomic.NewUint64(20)) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(20), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) @@ -82,7 +84,7 @@ func TestSpanReplication_NewAddDispatcherMessage_UseBlockTsForInFlightSyncPoint( Stage: heartbeatpb.BlockStage_WAITING, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(10), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) @@ -100,7 +102,7 @@ func TestSpanReplication_NewAddDispatcherMessage_UseSyncPointBlockTsWhenCommitte Stage: heartbeatpb.BlockStage_WAITING, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(30), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) @@ -117,7 +119,7 @@ func TestSpanReplication_NewAddDispatcherMessage_DontUseBlockTsAfterSyncPointDon Stage: heartbeatpb.BlockStage_DONE, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(20), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) @@ -134,7 +136,7 @@ func 
TestSpanReplication_NewAddDispatcherMessage_UseBlockTsMinusOneForDDLInFligh Stage: heartbeatpb.BlockStage_WAITING, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(9), req.Config.StartTs) require.True(t, req.Config.SkipDMLAsStartTs) @@ -152,7 +154,7 @@ func TestSpanReplication_NewAddDispatcherMessage_UseCommittedCheckpointForStaleD Stage: heartbeatpb.BlockStage_WAITING, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(10), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) @@ -170,7 +172,7 @@ func TestSpanReplication_NewAddDispatcherMessage_UseCommittedCheckpointForStaleD Stage: heartbeatpb.BlockStage_WRITING, }) - msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := replicaSet.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(11), req.Config.StartTs) require.False(t, req.Config.SkipDMLAsStartTs) diff --git a/maintainer/scheduler/balance.go b/maintainer/scheduler/balance.go index 734482e04a..b60114e8d8 100644 --- a/maintainer/scheduler/balance.go +++ b/maintainer/scheduler/balance.go @@ -148,7 +148,14 @@ func (s *balanceScheduler) doSplit(results pkgReplica.GroupCheckResult) int { spansNum := max(result.SpanNum, len(s.nodeManager.GetAliveNodes())*2) splitSpans := s.splitter.Split(context.Background(), result.Span.Span, spansNum, result.SpanType) if len(splitSpans) > 1 { - op := operator.NewSplitDispatcherOperator(s.spanController, result.Span, splitSpans, []node.ID{}, nil) + op := 
operator.NewSplitDispatcherOperator( + s.spanController, + result.Span, + splitSpans, + []node.ID{}, + s.operatorController.MaintainerEpoch(), + nil, + ) ret := s.operatorController.AddOperator(op) if ret { splitCount++ @@ -160,6 +167,6 @@ func (s *balanceScheduler) doSplit(results pkgReplica.GroupCheckResult) int { } func (s *balanceScheduler) doMove(replication *replica.SpanReplication, id node.ID) bool { - op := operator.NewMoveDispatcherOperator(s.spanController, replication, replication.GetNodeID(), id) + op := s.operatorController.NewMoveOperator(replication, replication.GetNodeID(), id) return s.operatorController.AddOperator(op) } diff --git a/maintainer/scheduler/balance_splits.go b/maintainer/scheduler/balance_splits.go index 7f808a01a9..7898fdf5de 100644 --- a/maintainer/scheduler/balance_splits.go +++ b/maintainer/scheduler/balance_splits.go @@ -132,9 +132,22 @@ func (s *balanceSplitsScheduler) Execute() time.Time { case replica.OpSplit: splitSpans := s.splitter.Split(context.Background(), checkResult.SplitSpan.Span, checkResult.SpanNum, checkResult.SpanType) if len(splitSpans) > 1 { - op := operator.NewSplitDispatcherOperator(s.spanController, checkResult.SplitSpan, splitSpans, checkResult.SplitTargetNodes, func(span *replica.SpanReplication, node node.ID) bool { - return s.operatorController.AddOperator(operator.NewAddDispatcherOperator(s.spanController, span, node, heartbeatpb.OperatorType_O_Split)) - }) + op := operator.NewSplitDispatcherOperator( + s.spanController, + checkResult.SplitSpan, + splitSpans, + checkResult.SplitTargetNodes, + s.operatorController.MaintainerEpoch(), + func(span *replica.SpanReplication, node node.ID) bool { + return s.operatorController.AddOperator(operator.NewAddDispatcherOperator( + s.spanController, + span, + node, + heartbeatpb.OperatorType_O_Split, + s.operatorController.MaintainerEpoch(), + )) + }, + ) ret := s.operatorController.AddOperator(op) if ret { availableSize-- @@ -142,7 +155,7 @@ func (s 
*balanceSplitsScheduler) Execute() time.Time { } case replica.OpMove: for _, span := range checkResult.MoveSpans { - op := operator.NewMoveDispatcherOperator(s.spanController, span, span.GetNodeID(), checkResult.TargetNode) + op := s.operatorController.NewMoveOperator(span, span.GetNodeID(), checkResult.TargetNode) ret := s.operatorController.AddOperator(op) if ret { availableSize-- diff --git a/maintainer/scheduler/basic.go b/maintainer/scheduler/basic.go index 344452309c..449516a31d 100644 --- a/maintainer/scheduler/basic.go +++ b/maintainer/scheduler/basic.go @@ -171,7 +171,13 @@ func (s *basicScheduler) schedule( absentReplications := s.spanController.GetAbsentByGroup(groupID, availableSize) pkgScheduler.BasicSchedule(availableSize, absentReplications, nodeSize, func(replication *replica.SpanReplication, id node.ID) bool { - return s.operatorController.AddOperator(operator.NewAddDispatcherOperator(s.spanController, replication, id, heartbeatpb.OperatorType_O_Add)) + return s.operatorController.AddOperator(operator.NewAddDispatcherOperator( + s.spanController, + replication, + id, + heartbeatpb.OperatorType_O_Add, + s.operatorController.MaintainerEpoch(), + )) }) return len(absentReplications) } diff --git a/maintainer/scheduler/drain.go b/maintainer/scheduler/drain.go index 6f2e57f685..bfe07daa8d 100644 --- a/maintainer/scheduler/drain.go +++ b/maintainer/scheduler/drain.go @@ -152,7 +152,7 @@ func (s *drainScheduler) Execute() time.Time { } if s.operatorController.AddOperator( - operator.NewMoveDispatcherOperator(s.spanController, replication, target, dest), + s.operatorController.NewMoveOperator(replication, target, dest), ) { nodeTaskSize[target]-- nodeTaskSize[dest]++ diff --git a/maintainer/scheduler/drain_test.go b/maintainer/scheduler/drain_test.go index 1ee5243430..8e6f246f61 100644 --- a/maintainer/scheduler/drain_test.go +++ b/maintainer/scheduler/drain_test.go @@ -107,7 +107,7 @@ func TestDrainSchedulerIgnoresUnrelatedOperatorCapacity(t *testing.T) { 
onTarget := addReplicatingSpan(t, cfID, sc, 1, target) unrelated := addReplicatingSpan(t, cfID, sc, 2, other) - require.True(t, oc.AddOperator(operator.NewMoveDispatcherOperator(sc, unrelated, other, dest))) + require.True(t, oc.AddOperator(operator.NewMoveDispatcherOperator(sc, unrelated, other, dest, oc.MaintainerEpoch()))) drainState.SetSelfNodeID(self) drainState.SetDispatcherDrainTarget(target, 1) diff --git a/maintainer/span/span_controller_test.go b/maintainer/span/span_controller_test.go index 827f175e12..3453f34596 100644 --- a/maintainer/span/span_controller_test.go +++ b/maintainer/span/span_controller_test.go @@ -487,7 +487,7 @@ func TestController_BindCommittedCheckpointToManagedSpan(t *testing.T) { task := controller.GetTasksByTableID(100)[0] controller.AdvanceMaintainerCommittedCheckpointTs(20) - msg := task.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + msg := task.NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add, 7) req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) require.Equal(t, uint64(20), req.Config.StartTs) } From 0d8b952571b78244a96f5c76dc63b4681e8b93e4 Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Mon, 22 Jun 2026 14:57:36 +0800 Subject: [PATCH 3/5] coordinator: address resume review feedback --- coordinator/changefeed/changefeed.go | 28 +++++--- .../changefeed/changefeed_db_backend.go | 7 +- coordinator/changefeed/changefeed_test.go | 18 +++++ coordinator/changefeed/etcd_backend.go | 20 +++++- coordinator/changefeed/etcd_backend_test.go | 68 +++++++++++++++++++ .../changefeed/mock/changefeed_db_backend.go | 15 ++++ coordinator/controller.go | 12 ++-- coordinator/controller_test.go | 32 ++++----- coordinator/coordinator_test.go | 36 +--------- pkg/common/maintainer_epoch.go | 15 ++++ pkg/common/maintainer_epoch_test.go | 36 ++++++++++ pkg/pdutil/utils.go | 11 --- pkg/pdutil/utils_test.go | 16 ----- 13 files changed, 210 insertions(+), 104 deletions(-) create mode 100644 
pkg/common/maintainer_epoch_test.go diff --git a/coordinator/changefeed/changefeed.go b/coordinator/changefeed/changefeed.go index 42fd6c1f35..97df589db2 100644 --- a/coordinator/changefeed/changefeed.go +++ b/coordinator/changefeed/changefeed.go @@ -55,6 +55,12 @@ func NewChangefeed(cfID common.ChangeFeedID, checkpointTs uint64, isNew bool, ) *Changefeed { + if info == nil { + log.Panic("changefeed info is nil", zap.Stringer("changefeedID", cfID)) + } + if info.Config == nil { + log.Panic("changefeed config is nil", zap.Stringer("changefeedID", cfID)) + } uri, err := url.Parse(info.SinkURI) if err != nil { log.Panic("unable to parse sink-uri", @@ -91,7 +97,7 @@ func NewChangefeed(cfID common.ChangeFeedID, // GetInfo returns the latest ChangeFeedInfo stored in memory. // -// It may return nil if the changefeed hasn't been fully initialized. +// Changefeed keeps info non-nil after construction. func (c *Changefeed) GetInfo() *config.ChangeFeedInfo { if c == nil || c.info == nil { return nil @@ -100,15 +106,13 @@ func (c *Changefeed) GetInfo() *config.ChangeFeedInfo { } // SetInfo updates the in-memory ChangeFeedInfo for the changefeed. -// -// It lazily initializes the internal pointer for uninitialized changefeeds -// (primarily used by unit tests). -// -// If the receiver is nil, it does nothing. func (c *Changefeed) SetInfo(info *config.ChangeFeedInfo) { if c == nil { return } + if info == nil { + log.Panic("changefeed info is nil", zap.Stringer("changefeedID", c.ID)) + } if c.info == nil { c.info = atomic.NewPointer(info) return @@ -238,7 +242,8 @@ func (c *Changefeed) GetStatusForResume() *heartbeatpb.MaintainerStatus { FeedState: status.FeedState, State: status.State, MaintainerEpoch: status.MaintainerEpoch, - // Old errors are meaningless for resume and can only block the resumed task. + // Resume creates a new maintainer owner, so errors reported by the + // previous owner must not block the resumed in-memory status. 
Err: []*heartbeatpb.RunningError{}, } @@ -265,18 +270,19 @@ func (c *Changefeed) GetLastSavedCheckPointTs() uint64 { func (c *Changefeed) NewAddMaintainerMessage(server node.ID) *messaging.TargetMessage { info := c.GetInfo() - if info == nil { - log.Panic("changefeed info is nil", zap.String("changefeedID", c.ID.String())) - } configData, err := info.MarshalWithTruncation(false) if err != nil { log.Panic("unable to marshal changefeed config", zap.Error(err)) } + checkpointTs := c.GetLastSavedCheckPointTs() + if status := c.GetStatus(); status != nil { + checkpointTs = status.CheckpointTs + } return messaging.NewSingleTargetMessage(server, messaging.MaintainerManagerTopic, &heartbeatpb.AddMaintainerRequest{ Id: c.ID.ToPB(), - CheckpointTs: c.GetStatus().CheckpointTs, + CheckpointTs: checkpointTs, Config: []byte(configData), IsNewChangefeed: c.isNew, KeyspaceId: info.KeyspaceID, diff --git a/coordinator/changefeed/changefeed_db_backend.go b/coordinator/changefeed/changefeed_db_backend.go index dcfd253e23..e43db7de4d 100644 --- a/coordinator/changefeed/changefeed_db_backend.go +++ b/coordinator/changefeed/changefeed_db_backend.go @@ -43,8 +43,11 @@ type Backend interface { CreateChangefeed(ctx context.Context, info *config.ChangeFeedInfo) error // UpdateChangefeed updates changefeed info to db UpdateChangefeed(ctx context.Context, info *config.ChangeFeedInfo, checkpointTs uint64, progress config.Progress) error - // BumpChangefeedEpoch persists a strictly newer epoch using the latest stored - // ChangeFeedInfo. It only reads and updates stored status when UpdateStatus is set. + // ResumeChangefeed persists the resumed status with a new owner epoch. + ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, checkpointTs uint64) (*config.ChangeFeedInfo, error) + // BumpChangefeedEpoch is the low-level ownership boundary used before a + // coordinator path can create a new maintainer owner. 
It only reads and + // updates stored status when UpdateStatus is set. BumpChangefeedEpoch(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, options EpochBumpOptions) (*config.ChangeFeedInfo, error) // PauseChangefeed persists the pause status to db for a changefeed PauseChangefeed(ctx context.Context, id common.ChangeFeedID) error diff --git a/coordinator/changefeed/changefeed_test.go b/coordinator/changefeed/changefeed_test.go index dc8227e2c2..3bbc48dab7 100644 --- a/coordinator/changefeed/changefeed_test.go +++ b/coordinator/changefeed/changefeed_test.go @@ -43,6 +43,21 @@ func TestNewChangefeed(t *testing.T) { require.True(t, cf.NeedCheckpointTsMessage()) } +func TestNewChangefeedRejectsInvalidInfo(t *testing.T) { + t.Parallel() + + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + require.Panics(t, func() { + NewChangefeed(cfID, nil, 100, true) + }) + require.Panics(t, func() { + NewChangefeed(cfID, &config.ChangeFeedInfo{ + SinkURI: "kafka://127.0.0.1:9092", + State: config.StateNormal, + }, 100, true) + }) +} + func TestChangefeed_GetSetInfo(t *testing.T) { cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) info := &config.ChangeFeedInfo{ @@ -59,6 +74,9 @@ func TestChangefeed_GetSetInfo(t *testing.T) { } cf.SetInfo(newInfo) require.Equal(t, newInfo, cf.GetInfo()) + require.Panics(t, func() { + cf.SetInfo(nil) + }) } func TestChangefeed_GetSetNodeID(t *testing.T) { diff --git a/coordinator/changefeed/etcd_backend.go b/coordinator/changefeed/etcd_backend.go index 29064b2461..fcb45abcb0 100644 --- a/coordinator/changefeed/etcd_backend.go +++ b/coordinator/changefeed/etcd_backend.go @@ -25,7 +25,6 @@ import ( "github.com/pingcap/ticdc/pkg/config" cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/etcd" - "github.com/pingcap/ticdc/pkg/pdutil" clientv3 "go.etcd.io/etcd/client/v3" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -245,7 +244,7 @@ func (b *EtcdBackend) 
BumpChangefeedEpoch( // Keep compatibility defaults when the bumped info replaces the // coordinator's in-memory copy after an upgrade. info.VerifyAndComplete() - epoch, err := pdutil.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) + epoch, err := common.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) if err != nil { return nil, errors.Trace(err) } @@ -325,6 +324,23 @@ func (b *EtcdBackend) BumpChangefeedEpoch( return nil, errors.Trace(err) } +// ResumeChangefeed persists the resumed state with a new owner epoch. +func (b *EtcdBackend) ResumeChangefeed( + ctx context.Context, + id common.ChangeFeedID, + candidateEpoch uint64, + checkpointTs uint64, +) (*config.ChangeFeedInfo, error) { + normalState := config.StateNormal + return b.BumpChangefeedEpoch(ctx, id, candidateEpoch, EpochBumpOptions{ + CheckpointTs: checkpointTs, + Progress: config.ProgressNone, + UpdateStatus: true, + State: &normalState, + UpdateError: true, + }) +} + func (b *EtcdBackend) PauseChangefeed(ctx context.Context, id common.ChangeFeedID) error { info, err := b.etcdClient.GetChangeFeedInfo(ctx, id.DisplayName) if err != nil { diff --git a/coordinator/changefeed/etcd_backend_test.go b/coordinator/changefeed/etcd_backend_test.go index dd90d4f38a..a316fc80e9 100644 --- a/coordinator/changefeed/etcd_backend_test.go +++ b/coordinator/changefeed/etcd_backend_test.go @@ -267,6 +267,74 @@ func TestBumpChangefeedEpochUpdatesStatus(t *testing.T) { require.Equal(t, uint64(9), got.Epoch) } +func TestResumeChangefeed(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + cdcClient := etcd.NewMockCDCEtcdClient(ctrl) + etcdClient := etcd.NewMockClient(ctrl) + cdcClient.EXPECT().GetEtcdClient().Return(etcdClient).AnyTimes() + cdcClient.EXPECT().GetClusterID().Return("test-cluster-id").AnyTimes() + backend := NewEtcdBackend(cdcClient) + + changefeedID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + info := &config.ChangeFeedInfo{ + ChangefeedID: changefeedID, 
+ Config: config.GetDefaultReplicaConfig(), + State: config.StateFailed, + Error: &config.RunningError{Message: "old error"}, + Epoch: 8, + } + value, err := info.Marshal() + require.NoError(t, err) + infoKey := etcd.GetEtcdKeyChangeFeedInfo("test-cluster-id", changefeedID.DisplayName) + persistedStatus := &config.ChangeFeedStatus{ + CheckpointTs: 200, + Progress: config.ProgressStopping, + } + + etcdClient.EXPECT(). + Get(gomock.Any(), infoKey). + Return(&clientv3.GetResponse{ + Kvs: []*mvccpb.KeyValue{{ + Value: []byte(value), + ModRevision: 3, + }}, + }, nil). + Times(1) + cdcClient.EXPECT(). + GetChangeFeedStatus(gomock.Any(), changefeedID). + Return(persistedStatus, int64(5), nil). + Times(1) + etcdClient.EXPECT(). + Txn(gomock.Any(), gomock.Len(2), NewFuncMatcher(func(i any) bool { + ops := i.([]clientv3.Op) + require.Len(t, ops, 2) + require.True(t, ops[0].IsPut()) + require.True(t, ops[1].IsPut()) + + persistedInfo := &config.ChangeFeedInfo{} + require.NoError(t, persistedInfo.Unmarshal(ops[0].ValueBytes())) + require.Equal(t, uint64(9), persistedInfo.Epoch) + require.Equal(t, config.StateNormal, persistedInfo.State) + require.Nil(t, persistedInfo.Error) + + status := &config.ChangeFeedStatus{} + require.NoError(t, status.Unmarshal(ops[1].ValueBytes())) + require.Equal(t, uint64(300), status.CheckpointTs) + require.Equal(t, config.ProgressNone, status.Progress) + return true + }), gomock.Len(0)). + Return(&clientv3.TxnResponse{Succeeded: true}, nil). 
+ Times(1) + + got, err := backend.ResumeChangefeed(context.Background(), changefeedID, 9, 300) + require.NoError(t, err) + require.Equal(t, uint64(9), got.Epoch) + require.Equal(t, config.StateNormal, got.State) + require.Nil(t, got.Error) +} + func TestBumpChangefeedEpochRetriesOnCASConflict(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() diff --git a/coordinator/changefeed/mock/changefeed_db_backend.go b/coordinator/changefeed/mock/changefeed_db_backend.go index bd967e496d..b6f9131f56 100644 --- a/coordinator/changefeed/mock/changefeed_db_backend.go +++ b/coordinator/changefeed/mock/changefeed_db_backend.go @@ -124,6 +124,21 @@ func (mr *MockBackendMockRecorder) PauseChangefeed(ctx, id interface{}) *gomock. return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PauseChangefeed", reflect.TypeOf((*MockBackend)(nil).PauseChangefeed), ctx, id) } +// ResumeChangefeed mocks base method. +func (m *MockBackend) ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, checkpointTs uint64) (*config.ChangeFeedInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ResumeChangefeed", ctx, id, candidateEpoch, checkpointTs) + ret0, _ := ret[0].(*config.ChangeFeedInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ResumeChangefeed indicates an expected call of ResumeChangefeed. +func (mr *MockBackendMockRecorder) ResumeChangefeed(ctx, id, candidateEpoch, checkpointTs interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResumeChangefeed", reflect.TypeOf((*MockBackend)(nil).ResumeChangefeed), ctx, id, candidateEpoch, checkpointTs) +} + // SetChangefeedProgress mocks base method. 
func (m *MockBackend) SetChangefeedProgress(ctx context.Context, id common.ChangeFeedID, progress config.Progress) error { m.ctrl.T.Helper() diff --git a/coordinator/controller.go b/coordinator/controller.go index a9750698d0..da7d31e5e0 100644 --- a/coordinator/controller.go +++ b/coordinator/controller.go @@ -1012,14 +1012,7 @@ func (c *Controller) ResumeChangefeed( checkpointTs = newCheckpointTs } epoch := pdutil.GenerateChangefeedEpoch(ctx, c.pdClient) - normalState := config.StateNormal - info, err := c.backend.BumpChangefeedEpoch(ctx, id, epoch, changefeed.EpochBumpOptions{ - CheckpointTs: checkpointTs, - Progress: config.ProgressNone, - UpdateStatus: true, - State: &normalState, - UpdateError: true, - }) + info, err := c.backend.ResumeChangefeed(ctx, id, epoch, checkpointTs) if err != nil { return errors.Trace(err) } @@ -1191,6 +1184,9 @@ func (c *Controller) updateChangefeedEpoch( if err != nil { return errors.Trace(err) } + if info == nil { + return errors.New("bumped changefeed info is nil") + } cf.SetInfo(info) return nil } diff --git a/coordinator/controller_test.go b/coordinator/controller_test.go index b72dc7fdc5..e57604841f 100644 --- a/coordinator/controller_test.go +++ b/coordinator/controller_test.go @@ -582,11 +582,12 @@ func TestResumeChangefeed(t *testing.T) { // no changefeed require.NotNil(t, controller.ResumeChangefeed(context.Background(), common.NewChangeFeedIDWithName("test2", common.DefaultKeyspaceName), 12, true)) - backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()).Return(nil, errors.New("failed")).Times(1) + backend.EXPECT().ResumeChangefeed(gomock.Any(), cfID, gomock.Any(), uint64(12)). 
+ Return(nil, errors.New("failed")).Times(1) require.NotNil(t, controller.ResumeChangefeed(context.Background(), cfID, 12, true)) require.Equal(t, config.StateFailed, changefeedDB.GetByID(cfID).GetInfo().State) - expectResumeEpochBump(t, backend, cfID, cf, 12) + expectResumeChangefeed(t, backend, cfID, cf, 12) require.Nil(t, controller.ResumeChangefeed(context.Background(), cfID, 12, false)) require.Equal(t, config.StateNormal, changefeedDB.GetByID(cfID).GetInfo().State) } @@ -640,7 +641,7 @@ func TestResumeChangefeedOverwriteUpdatesLastSavedCheckpointTs(t *testing.T) { changefeedDB.AddStoppedChangefeed(cf) newCheckpointTs := uint64(120) - expectResumeEpochBump(t, backend, cfID, cf, newCheckpointTs) + expectResumeChangefeed(t, backend, cfID, cf, newCheckpointTs) require.Nil(t, controller.ResumeChangefeed(context.Background(), cfID, newCheckpointTs, true)) require.Equal(t, newCheckpointTs, changefeedDB.GetByID(cfID).GetLastSavedCheckPointTs()) } @@ -677,7 +678,7 @@ func TestResumeChangefeedIgnoresStaleMaintainerErrorAndSchedules(t *testing.T) { _, _, err := cf.ForceUpdateStatus(stale) require.NotNil(t, err) - expectResumeEpochBump(t, backend, cfID, cf, 100) + expectResumeChangefeed(t, backend, cfID, cf, 100) require.NoError(t, controller.ResumeChangefeed(context.Background(), cfID, 100, false)) // The changefeed should be enqueued for scheduling and should not be blocked by the stale error. @@ -716,13 +717,10 @@ func TestResumeChangefeedUsesBackendReturnedInfo(t *testing.T) { require.NoError(t, err) backendInfo.SinkURI = "mysql://upstream:4000" backendInfo.State = config.StateNormal - backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + backend.EXPECT().ResumeChangefeed(gomock.Any(), cfID, gomock.Any(), uint64(100)). 
+ DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, checkpointTs uint64) (*config.ChangeFeedInfo, error) { require.NotZero(t, candidateEpoch) - require.True(t, options.UpdateStatus) - require.True(t, options.UpdateError) - require.NotNil(t, options.State) - require.Equal(t, config.StateNormal, *options.State) + require.Equal(t, uint64(100), checkpointTs) backendInfo.Epoch = candidateEpoch return backendInfo, nil }).Times(1) @@ -732,7 +730,7 @@ func TestResumeChangefeedUsesBackendReturnedInfo(t *testing.T) { require.Equal(t, config.StateNormal, changefeedDB.GetByID(cfID).GetInfo().State) } -func expectResumeEpochBump( +func expectResumeChangefeed( t *testing.T, backend *mock_changefeed.MockBackend, cfID common.ChangeFeedID, @@ -741,17 +739,13 @@ func expectResumeEpochBump( ) { t.Helper() - backend.EXPECT().BumpChangefeedEpoch(gomock.Any(), cfID, gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, options changefeed.EpochBumpOptions) (*config.ChangeFeedInfo, error) { + backend.EXPECT().ResumeChangefeed(gomock.Any(), cfID, gomock.Any(), checkpointTs). 
+ DoAndReturn(func(_ context.Context, _ common.ChangeFeedID, candidateEpoch uint64, gotCheckpointTs uint64) (*config.ChangeFeedInfo, error) { require.NotZero(t, candidateEpoch) - require.NotNil(t, options.State) - require.Equal(t, config.StateNormal, *options.State) - require.True(t, options.UpdateStatus) - require.Equal(t, checkpointTs, options.CheckpointTs) - require.Equal(t, config.ProgressNone, options.Progress) + require.Equal(t, checkpointTs, gotCheckpointTs) info, err := cf.GetInfo().Clone() require.NoError(t, err) - info.State = *options.State + info.State = config.StateNormal info.Epoch = candidateEpoch return info, nil }).Times(1) diff --git a/coordinator/coordinator_test.go b/coordinator/coordinator_test.go index 9566b2cb16..03e9022e17 100644 --- a/coordinator/coordinator_test.go +++ b/coordinator/coordinator_test.go @@ -412,7 +412,7 @@ func mockBumpChangefeedEpoch( if err != nil { return nil, err } - info.Epoch, err = pdutil.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) + info.Epoch, err = common.AdvanceChangefeedEpoch(candidateEpoch, info.Epoch) if err != nil { return nil, err } @@ -1033,40 +1033,6 @@ func TestHandleStateChangeBumpsEpochForWarningState(t *testing.T) { require.Equal(t, oldEpoch, req.MaintainerEpoch) } -func TestHandleStateChangeSkipsNilChangefeedInfo(t *testing.T) { - ctrl := gomock.NewController(t) - t.Cleanup(ctrl.Finish) - - backend := mock_changefeed.NewMockBackend(ctrl) - changefeedDB := changefeed.NewChangefeedDB(1216) - controller := &Controller{ - backend: backend, - changefeedDB: changefeedDB, - } - co := &coordinator{ - backend: backend, - controller: controller, - } - - cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) - cf := changefeed.NewChangefeed(cfID, &config.ChangeFeedInfo{ - ChangefeedID: cfID, - Config: config.GetDefaultReplicaConfig(), - State: config.StateNormal, - SinkURI: "mysql://127.0.0.1:3306", - }, 1, false) - changefeedDB.AddAbsentChangefeed(cf) - cf.SetInfo(nil) - - event := 
newChangefeedChange(cf, config.StateWarning, ChangeState, &config.RunningError{ - Time: time.Unix(1, 0), - Addr: "127.0.0.1:8300", - Code: "CDC:ErrSinkURIInvalid", - Message: "sink uri invalid", - }) - require.NoError(t, co.handleStateChange(context.Background(), event)) -} - func TestHandleStateChangePersistsRuntimeStateWhenStateChanges(t *testing.T) { ctrl := gomock.NewController(t) t.Cleanup(ctrl.Finish) diff --git a/pkg/common/maintainer_epoch.go b/pkg/common/maintainer_epoch.go index 856ee9c801..4a244721e8 100644 --- a/pkg/common/maintainer_epoch.go +++ b/pkg/common/maintainer_epoch.go @@ -13,6 +13,8 @@ package common +import cerror "github.com/pingcap/ticdc/pkg/errors" + // MaintainerEpochMatches keeps rolling-upgrade compatibility while enforcing // exact owner epochs after upgraded maintainers report them. Epoch 0 means // either side predates the maintainer epoch field, so it stays accepted during @@ -21,3 +23,16 @@ package common func MaintainerEpochMatches(reportedEpoch, currentEpoch uint64) bool { return reportedEpoch == 0 || currentEpoch == 0 || reportedEpoch == currentEpoch } + +// AdvanceChangefeedEpoch returns max(candidate, current+1). +func AdvanceChangefeedEpoch(candidate, current uint64) (uint64, error) { + if candidate > current { + return candidate, nil + } + if current == ^uint64(0) { + // This guard is defensive. Normal PD TSO based epochs should never reach + // MaxUint64, but wrapping here would let a stale owner look newer. + return 0, cerror.ErrSchedulerRequestFailed.GenWithStackByArgs("changefeed epoch overflow") + } + return current + 1, nil +} diff --git a/pkg/common/maintainer_epoch_test.go b/pkg/common/maintainer_epoch_test.go new file mode 100644 index 0000000000..f04372ff45 --- /dev/null +++ b/pkg/common/maintainer_epoch_test.go @@ -0,0 +1,36 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package common + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestAdvanceChangefeedEpoch(t *testing.T) { + t.Parallel() + + epoch, err := AdvanceChangefeedEpoch(10, 8) + require.NoError(t, err) + require.Equal(t, uint64(10), epoch) + + epoch, err = AdvanceChangefeedEpoch(10, 12) + require.NoError(t, err) + require.Equal(t, uint64(13), epoch) + + _, err = AdvanceChangefeedEpoch(10, ^uint64(0)) + require.Error(t, err) + require.ErrorContains(t, err, "changefeed epoch overflow") +} diff --git a/pkg/pdutil/utils.go b/pkg/pdutil/utils.go index 42a43f4757..ce4e952fa5 100644 --- a/pkg/pdutil/utils.go +++ b/pkg/pdutil/utils.go @@ -66,14 +66,3 @@ func GenerateChangefeedEpoch(ctx context.Context, pdClient pd.Client) uint64 { } return oracle.ComposeTS(phyTs, logical) } - -// AdvanceChangefeedEpoch returns max(candidate, current+1). 
-func AdvanceChangefeedEpoch(candidate, current uint64) (uint64, error) { - if candidate > current { - return candidate, nil - } - if current == ^uint64(0) { - return 0, cerror.ErrSchedulerRequestFailed.GenWithStackByArgs("changefeed epoch overflow") - } - return current + 1, nil -} diff --git a/pkg/pdutil/utils_test.go b/pkg/pdutil/utils_test.go index addde9e6a6..235466bd0d 100644 --- a/pkg/pdutil/utils_test.go +++ b/pkg/pdutil/utils_test.go @@ -45,19 +45,3 @@ func TestGetSourceID(t *testing.T) { return sourceID == 2 }, 5*time.Second, 100*time.Millisecond) } - -func TestAdvanceChangefeedEpoch(t *testing.T) { - t.Parallel() - - epoch, err := AdvanceChangefeedEpoch(10, 8) - require.NoError(t, err) - require.Equal(t, uint64(10), epoch) - - epoch, err = AdvanceChangefeedEpoch(10, 12) - require.NoError(t, err) - require.Equal(t, uint64(13), epoch) - - _, err = AdvanceChangefeedEpoch(10, ^uint64(0)) - require.Error(t, err) - require.ErrorContains(t, err, "changefeed epoch overflow") -} From 06a67887e9652721140286ee29ae59ef9dc8d153 Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Tue, 23 Jun 2026 14:35:43 +0800 Subject: [PATCH 4/5] coordinator: refresh changefeed backend mock --- coordinator/changefeed/mock/changefeed_db_backend.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coordinator/changefeed/mock/changefeed_db_backend.go b/coordinator/changefeed/mock/changefeed_db_backend.go index b6f9131f56..f41fce976a 100644 --- a/coordinator/changefeed/mock/changefeed_db_backend.go +++ b/coordinator/changefeed/mock/changefeed_db_backend.go @@ -125,7 +125,7 @@ func (mr *MockBackendMockRecorder) PauseChangefeed(ctx, id interface{}) *gomock. } // ResumeChangefeed mocks base method. 
-func (m *MockBackend) ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, candidateEpoch uint64, checkpointTs uint64) (*config.ChangeFeedInfo, error) { +func (m *MockBackend) ResumeChangefeed(ctx context.Context, id common.ChangeFeedID, candidateEpoch, checkpointTs uint64) (*config.ChangeFeedInfo, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "ResumeChangefeed", ctx, id, candidateEpoch, checkpointTs) ret0, _ := ret[0].(*config.ChangeFeedInfo) From cd9e0b5e934a27bd40ddf471e96b0e27dc99e654 Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Tue, 23 Jun 2026 15:51:30 +0800 Subject: [PATCH 5/5] pkg: use canonical errors import --- pkg/common/maintainer_epoch.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/common/maintainer_epoch.go b/pkg/common/maintainer_epoch.go index 4a244721e8..35f856b927 100644 --- a/pkg/common/maintainer_epoch.go +++ b/pkg/common/maintainer_epoch.go @@ -13,7 +13,7 @@ package common -import cerror "github.com/pingcap/ticdc/pkg/errors" +import "github.com/pingcap/ticdc/pkg/errors" // MaintainerEpochMatches keeps rolling-upgrade compatibility while enforcing // exact owner epochs after upgraded maintainers report them. Epoch 0 means @@ -32,7 +32,7 @@ func AdvanceChangefeedEpoch(candidate, current uint64) (uint64, error) { if current == ^uint64(0) { // This guard is defensive. Normal PD TSO based epochs should never reach // MaxUint64, but wrapping here would let a stale owner look newer. - return 0, cerror.ErrSchedulerRequestFailed.GenWithStackByArgs("changefeed epoch overflow") + return 0, errors.ErrSchedulerRequestFailed.GenWithStackByArgs("changefeed epoch overflow") } return current + 1, nil }