From 0d45647a4fba9f5c046bfb053ea4f898dce933dc Mon Sep 17 00:00:00 2001 From: dongmen <414110582@qq.com> Date: Tue, 16 Jun 2026 15:28:39 +0800 Subject: [PATCH 1/3] tests: fix weekly random DDL single stability --- .issue/weekly_rand_single_failure_analysis.md | 747 ++++++++++++++++++ .issue/weekly_rand_single_notebook.md | 345 ++++++++ Makefile | 4 + .../dispatcher/basic_dispatcher.go | 36 +- .../basic_dispatcher_active_active_test.go | 85 ++ downstreamadapter/dispatcher/helper.go | 55 +- downstreamadapter/dispatcher/helper_test.go | 48 ++ .../eventcollector/dispatcher_stat.go | 6 + .../eventcollector/dispatcher_stat_test.go | 84 ++ .../eventcollector/event_collector.go | 34 +- maintainer/barrier.go | 37 +- maintainer/barrier_event.go | 207 ++++- maintainer/barrier_test.go | 485 ++++++++++++ maintainer/operator/operator_move.go | 1 + maintainer/operator/operator_move_test.go | 20 + maintainer/operator/operator_remove.go | 1 + maintainer/span/span_controller.go | 46 +- maintainer/span/span_controller_test.go | 57 ++ pkg/eventservice/event_scanner.go | 6 +- pkg/eventservice/event_scanner_test.go | 67 +- .../run_weekly_rand_ddl_it_in_ci.sh | 38 + .../weekly_rand_multi/conf/changefeed.toml | 4 + .../conf/changefeed_mysql.toml | 7 + .../weekly_rand_multi/conf/consumer.toml | 2 + .../weekly_rand_multi/run.sh | 204 +++++ .../conf/changefeed.toml | 5 + .../conf/changefeed_mysql.toml | 7 + .../conf/consumer.toml | 5 + .../weekly_rand_multi_failover/run.sh | 210 +++++ .../conf/changefeed_mysql.toml | 2 + .../weekly_rand_single/conf/consumer.toml | 2 + .../weekly_rand_single/run.sh | 189 +++++ .../conf/changefeed_mysql.toml | 2 + .../weekly_rand_slow_lossy_ddl/run.sh | 136 ++++ .../utils/random_ddl_test_runner/autotune.go | 57 ++ .../random_ddl_test_runner/autotune_test.go | 52 ++ .../utils/random_ddl_test_runner/bootstrap.go | 210 +++++ tests/utils/random_ddl_test_runner/config.go | 329 ++++++++ tests/utils/random_ddl_test_runner/db.go | 39 + 
tests/utils/random_ddl_test_runner/ddl.go | 476 +++++++++++ .../utils/random_ddl_test_runner/ddl_test.go | 60 ++ .../random_ddl_test_runner/ddl_worker.go | 165 ++++ tests/utils/random_ddl_test_runner/dml.go | 381 +++++++++ .../utils/random_ddl_test_runner/dml_test.go | 48 ++ .../random_ddl_test_runner/extra_workers.go | 185 +++++ .../utils/random_ddl_test_runner/failover.go | 161 ++++ tests/utils/random_ddl_test_runner/health.go | 88 +++ tests/utils/random_ddl_test_runner/logger.go | 21 + tests/utils/random_ddl_test_runner/logscan.go | 224 ++++++ .../random_ddl_test_runner/logscan_test.go | 70 ++ tests/utils/random_ddl_test_runner/main.go | 91 +++ tests/utils/random_ddl_test_runner/model.go | 469 +++++++++++ tests/utils/random_ddl_test_runner/motif.go | 166 ++++ tests/utils/random_ddl_test_runner/runner.go | 18 + .../utils/random_ddl_test_runner/selector.go | 71 ++ .../random_ddl_test_runner/selector_test.go | 30 + .../random_ddl_test_runner/syncpoint_diff.go | 471 +++++++++++ .../utils/random_ddl_test_runner/workload.go | 465 +++++++++++ 58 files changed, 7492 insertions(+), 39 deletions(-) create mode 100644 .issue/weekly_rand_single_failure_analysis.md create mode 100644 .issue/weekly_rand_single_notebook.md create mode 100644 downstreamadapter/dispatcher/helper_test.go create mode 100755 tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh create mode 100644 tests/integration_tests/weekly_rand_multi/conf/changefeed.toml create mode 100644 tests/integration_tests/weekly_rand_multi/conf/changefeed_mysql.toml create mode 100644 tests/integration_tests/weekly_rand_multi/conf/consumer.toml create mode 100644 tests/integration_tests/weekly_rand_multi/run.sh create mode 100644 tests/integration_tests/weekly_rand_multi_failover/conf/changefeed.toml create mode 100644 tests/integration_tests/weekly_rand_multi_failover/conf/changefeed_mysql.toml create mode 100644 tests/integration_tests/weekly_rand_multi_failover/conf/consumer.toml create mode 100644 
tests/integration_tests/weekly_rand_multi_failover/run.sh create mode 100644 tests/integration_tests/weekly_rand_single/conf/changefeed_mysql.toml create mode 100644 tests/integration_tests/weekly_rand_single/conf/consumer.toml create mode 100644 tests/integration_tests/weekly_rand_single/run.sh create mode 100644 tests/integration_tests/weekly_rand_slow_lossy_ddl/conf/changefeed_mysql.toml create mode 100644 tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh create mode 100644 tests/utils/random_ddl_test_runner/autotune.go create mode 100644 tests/utils/random_ddl_test_runner/autotune_test.go create mode 100644 tests/utils/random_ddl_test_runner/bootstrap.go create mode 100644 tests/utils/random_ddl_test_runner/config.go create mode 100644 tests/utils/random_ddl_test_runner/db.go create mode 100644 tests/utils/random_ddl_test_runner/ddl.go create mode 100644 tests/utils/random_ddl_test_runner/ddl_test.go create mode 100644 tests/utils/random_ddl_test_runner/ddl_worker.go create mode 100644 tests/utils/random_ddl_test_runner/dml.go create mode 100644 tests/utils/random_ddl_test_runner/dml_test.go create mode 100644 tests/utils/random_ddl_test_runner/extra_workers.go create mode 100644 tests/utils/random_ddl_test_runner/failover.go create mode 100644 tests/utils/random_ddl_test_runner/health.go create mode 100644 tests/utils/random_ddl_test_runner/logger.go create mode 100644 tests/utils/random_ddl_test_runner/logscan.go create mode 100644 tests/utils/random_ddl_test_runner/logscan_test.go create mode 100644 tests/utils/random_ddl_test_runner/main.go create mode 100644 tests/utils/random_ddl_test_runner/model.go create mode 100644 tests/utils/random_ddl_test_runner/motif.go create mode 100644 tests/utils/random_ddl_test_runner/runner.go create mode 100644 tests/utils/random_ddl_test_runner/selector.go create mode 100644 tests/utils/random_ddl_test_runner/selector_test.go create mode 100644 tests/utils/random_ddl_test_runner/syncpoint_diff.go create mode 
100644 tests/utils/random_ddl_test_runner/workload.go diff --git a/.issue/weekly_rand_single_failure_analysis.md b/.issue/weekly_rand_single_failure_analysis.md new file mode 100644 index 0000000000..ef73317889 --- /dev/null +++ b/.issue/weekly_rand_single_failure_analysis.md @@ -0,0 +1,747 @@ +# weekly_rand_single 失败分析与修复计划 + +## 目标 + +修复 `/tmp/tidb_cdc_test/weekly_rand_single` case 在收敛阶段超时的问题,并用回归测试和连续 5 次 case 通过确认修复有效。 + +当前修复分支:`fix-weekly-rand-single-ddl-progress`,基于 `0115-ddl-test` 的 `0ffe03f83`。 + +## 现象 + +失败不是 sync-diff 数据不一致,而是 workload 结束后等待 finish mark 同步到下游超时。 + +关键日志: + +- `runner.log:2944`:`syncpoint diff` 成功,`primary_ts=467013528453120000`。 +- `runner.log:5122`:workload finished,开始等待收敛。 +- `runner.log:5123-5124`:上游创建 `db1.finish_mark` 并等待下游出现。 +- `runner.log:5205-5303`:checkpoint 卡在 `467013630689280000`。 +- `runner.log:5304`:`runner failed: context deadline exceeded`。 + +TSO 换算(Asia/Shanghai): + +- `467013630689280000` = `2026-06-15 17:27:00.000 +08:00`。 +- `467013636614783103` = `2026-06-15 17:27:22.604 +08:00`。 +- `467013638553600000` = `2026-06-15 17:27:30.000 +08:00`。 +- `467013645501726800` = `2026-06-15 17:27:56.505 +08:00`。 +- `467013645501726844` = `2026-06-15 17:27:56.505 +08:00`。 + +## 已确认的卡点 + +CDC 日志显示 barrier 没有完成 coverage,导致 maintainer 不能把 global checkpoint 推过早期 DDL/syncpoint barrier: + +- DDL barrier:`commitTs=467013636614783103`,query 为 `CREATE INDEX idx_d_6619 ON db1.t08(d)`。 +- 该 DDL 覆盖物理分区表 `142, 143, 144, 145`。 +- `142/143/144` 都有 `dispatcher receive ddl event` 和 ack。 +- `145` 没有对应的 accepted DDL 日志,maintainer 报 `reported count: 3, require count: 4, uncovered tables: 145`。 +- syncpoint barrier:`commitTs=467013638553600000`。 +- 后续 syncpoint 又显示 `uncovered tables: 142, 143, 144, 145`,本质上是前面的 DDL barrier 没过,导致后面的 barrier 继续被挡住。 +- `maintainer.go` 反复选择 `newCheckpointTs=467013630689279999`,runner 侧看到的 checkpoint 为 `467013630689280000`。 + +同时,event scanner 存在一个明确的无进展循环: + +- `tableID=2669` 对应旧物理表 `db3.t10_r_7179892`。 +- TiDB DDL 在 
`2026-06-15 17:27:56 +08:00` 对该表执行 `TRUNCATE TABLE db3.t10_r_7179892`。 +- schema store 的删除版本为 `deleteVersion=467013645501726800`。 +- scanner 反复请求 `GetTableInfo(tableID=2669, ts=467013645501726844)`。 +- 因为 `ts >= deleteVersion`,`multi_version.go` 返回 `TableDeletedError`。 +- `event_scanner.go` 把该错误转换成 `nil, nil`,随后以 `rawEvent.CRTs-1` 调用 `finalizeScan`。 + +问题在于这次原始事件的 `rawEvent.CRTs-1` 正好等于当前扫描起点;scanner 发送的 resolved event 没有推进水位。下一轮仍然从同一个起点扫描到同一条 raw event,然后再次遇到 `TableDeletedError`,形成死循环。 + +相关代码: + +- `logservice/schemastore/multi_version.go`:`getTableInfo` 在 `ts >= deleteVersion` 时返回 `TableDeletedError`。 +- `pkg/eventservice/event_scanner.go`:`scanAndMergeEvents` 在 `tableInfo == nil` 时调用 `finalizeScan(..., rawEvent.CRTs-1)`。 +- `logservice/eventstore/event_store.go`:iterator 扫描范围是 `(CommitTsStart, CommitTsEnd]`,所以 resolved ts 必须严格大于旧的 `CommitTsStart` 才能跳过当前 raw event。 + +table 145 的 DDL 缺失与 reset/replay 状态有关: + +- `cdc-2026-06-15T17-51-16.335.log:726193`:dispatcher `1774578769225496409714574658290184438691` 收到 reset,`epoch=7`,`resetTs=467013575639039999`。 +- 同一 DDL 的旧 epoch 事件随后被正确忽略:`eventEpoch=6`,`dispatcherEpoch=7`。 +- `cdc-2026-06-15T17-52-19.370.log:121963`:event service 已经向 table 145 dispatcher 重发 DDL,`commitTs=467013636614783103`,`seq=1325`。 +- 但 downstream dispatcher 没有记录 `dispatcher receive ddl event`,说明事件在进入 dispatcher 前被 eventcollector 的状态过滤或 reset 后状态不一致挡住。 +- `downstreamadapter/eventcollector/dispatcher_stat.go` 原来在 `advanceEpochForReset` 只切换 epoch/maxEventTs,没有把 `lastEventCommitTs` 和同 ts DDL/SyncPoint 去重标志回到 resetTs。 +- reset 的语义是从 `resetTs` 重新 replay 新 epoch 事件;如果旧 epoch 已经把 `lastEventCommitTs` 推到更大值,新 epoch 中位于 `(resetTs, oldLastEventCommitTs)` 的 replay DDL 会被当成旧事件过滤,table 145 就不会向 maintainer 上报 DDL barrier。 + +## 根因判断 + +目前确认有三个会阻塞 checkpoint 前进的问题: + +1. 
直接卡住原始 case 的问题是 eventcollector reset 后没有同步重置 commitTs 去重状态。table 145 reset 到新 epoch 后需要 replay `467013636614783103` 的 DDL,但旧 epoch 的 `lastEventCommitTs` 可能已经更大,导致 replay DDL 在进入 downstream dispatcher 前被过滤。maintainer 因此一直等不到 table 145 的 DDL barrier report。 + +2. event scanner 还有一个独立的无进展问题:deleted table raw event 被跳过时 resolved ts 仍可能停在 `rawEvent.CRTs-1`,而该值等于当前 scan start 时,下一轮会再次读到同一条 raw event。 + +对于已经删除或 truncate 后的旧物理表,遇到无法取到 table info 的 raw event 时,scanner 不能继续把 resolved ts 固定在 `rawEvent.CRTs-1`。如果该值等于本轮 `CommitTsStart`,event broker 下一轮仍会在 `(CommitTsStart, CommitTsEnd]` 内看到同一条 event,导致 dispatcher 无法完成对应 DDL/syncpoint coverage,最终 changefeed checkpoint 卡住,finish mark 永远不能同步到下游。 + +3. 第一次带前两个修复重新跑 case 后,原 table 145 DDL 卡点没有复现,但暴露出同一 syncpoint barrier 被迟到 WAITING 状态重建后的覆盖丢失问题。第一次 `commitTs=467018333552640000` 的 syncpoint barrier 已经完成并从 `blockedEvents` 删除;随后迟到 WAITING 又创建了第二个同 ts syncpoint barrier。第二个 barrier 通过 checkpoint-forward 直接进入 selected/pass 阶段,但新的 range checker 没有继承第一次 barrier 中已经 DONE 的 span block state,导致 table 299 永久显示 uncovered,checkpoint 卡住超过 5 分钟。 + +这个问题不能通过简单忽略迟到 WAITING 解决,因为迟到或重启后的 dispatcher 仍可能需要收到 Pass 才能解除本地 block。正确处理方式是:当 barrier 因 checkpoint-forward 进入 selected 阶段时,和正常 writer 选择路径一样重置 DONE 阶段进度,并把当前 spanController 中已严格越过 barrier、或同一 `(commitTs, isSyncPoint)` 已经上报 DONE 的 replication 计入新的 range checker。 + +## 修复计划 + +1. 在 `pkg/eventservice/event_scanner.go` 中修改 deleted table 分支。 + - 当前行为:`finalizeScan(..., rawEvent.CRTs-1)`。 + - 目标行为:对 `TableDeletedError` 造成的 `tableInfo == nil`,跳过当前 raw event 所在 commit ts,并用 `rawEvent.CRTs` 作为 resolved ts。 + - 这样下一轮 iterator 的 `(CommitTsStart, CommitTsEnd]` 不会再包含当前 raw event。 + +2. 在 `downstreamadapter/eventcollector/dispatcher_stat.go` 中修改 reset 状态。 + - 当前行为:`advanceEpochForReset` 只切换 epoch 和 `maxEventTs`。 + - 目标行为:成功进入新 epoch 时,把 `lastEventCommitTs` 重置为 `resetTs`,并清掉 `gotDDLOnTs` / `gotSyncpointOnTS`。 + - 这样 reset replay 的 DDL/SyncPoint 不会被旧 epoch 的 commitTs 去重状态过滤。 + +3. 
增加聚焦回归测试。 + - 文件:`pkg/eventservice/event_scanner_test.go`。 + - 场景:mock schema store 返回 `TableDeletedError`,scan range 的 `CommitTsStart` 设置为 `rawEvent.CRTs-1`。 + - 断言:scanner 不产生 DML,返回 resolved event,并且 resolved ts 等于 `rawEvent.CRTs` 且严格大于 scan start。 + - 文件:`downstreamadapter/eventcollector/dispatcher_stat_test.go`。 + - 场景:旧 epoch 的 `lastEventCommitTs=220`,reset 到 `150`,新 epoch handshake 后 replay `180` 的 DDL。 + - 断言:reset 后 commitTs 状态回到 `150`,并且 `180` 的 DDL 可以被转发到 dispatcher。 + +4. 在 `maintainer/barrier_event.go` 中修复 checkpoint-forward selected 路径。 + - 当前行为:`checkBlockedDispatchers` 发现某个 replication 已越过 barrier 后,只设置 `selected=true` 和 `writerDispatcherAdvanced=true`。 + - 目标行为:进入 selected 阶段时统一调用重置逻辑,重建或清空 range checker,并把已越过当前 barrier 的 replication 加入 DONE 阶段覆盖。 + - `forwardBarrierEvent` 保持 `checkpointTs > commitTs` 的严格判断,同时新增同一 `(commitTs, isSyncPoint)` 且 `Stage_DONE` 的判断,避免把 `checkpointTs == commitTs` 误认为 syncpoint 已经 flush。 + +5. 增加 maintainer 回归测试。 + - 文件:`maintainer/barrier_test.go`。 + - 场景:同一 syncpoint barrier 先正常完成并删除;迟到 WAITING 重建同 ts barrier;一个 dispatcher checkpoint-forward 触发 selected。 + - 断言:旧 barrier 中已 DONE 的 dispatcher 会计入新 range checker,重建的 barrier 可以立即完成,不会留下永久 uncovered table。 + +6. 本地验证顺序。 + - 先跑 `go test ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1`。 + - 再跑 `go test ./downstreamadapter/eventcollector -run TestAdvanceEpochForResetClearsCommitTsFilter -count=1`。 + - 再跑 `go test ./maintainer -run TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers -count=1`。 + - 再分别跑 `go test ./pkg/eventservice -count=1`、`go test ./downstreamadapter/eventcollector -count=1`、`go test ./maintainer -count=1`。 + - 如编译或测试失败,先修复单测/逻辑再扩大验证。 + +7. case 验证。 + - 重新构建需要的 `cdc` binary。 + - 运行 `weekly_rand_single` case。 + - 连续通过 5 次才认为成功。 + +8. 
独立审查。 + - 修复完成后让 reviewer subagent 检查 diff、回归测试和剩余风险。 + - 若发现阻塞问题,修复后重新跑相关测试。 + +## 调查记录 + +- 已读取 TiCDC event-broker / schema-store 相关代码路径。 +- 已确认 `runner.log` 的失败点为收敛超时,不是 sync-diff mismatch。 +- 已确认 event store 迭代器扫描范围是 `(CommitTsStart, CommitTsEnd]`。 +- 已确认 `TableDeletedError` 目前只在 `getTableInfo4Txn` 单元测试里覆盖,没有覆盖 scanner 水位是否前进。 + +## 已实施修复 + +- `pkg/eventservice/event_scanner.go` + - deleted-table 分支从 `finalizeScan(..., rawEvent.CRTs-1)` 改为 `finalizeScan(..., rawEvent.CRTs)`。 + - 目的:让当前无法解码的 post-delete raw event 被排除在下一轮 `(CommitTsStart, CommitTsEnd]` 之外。 + +- `pkg/eventservice/event_scanner_test.go` + - 更新 `TestEventScannerWithDeleteTable` 的预期:删除后的第一条 raw event 被跳过后,resolved ts 前进到该 raw event 的 `CRTs`。 + - 新增 `TestScanAndMergeEventsSkipsDeletedTableTxn`,直接覆盖 `TableDeletedError` 且 `CommitTsStart == rawEvent.CRTs-1` 的无进展场景。 + +- `downstreamadapter/eventcollector/dispatcher_stat.go` + - `advanceEpochForReset` 成功切换到新 epoch 后,把 `lastEventCommitTs` 重置为 `resetTs`。 + - 同时清理 `gotDDLOnTs` 和 `gotSyncpointOnTS`。 + - 目的:reset 后从 `resetTs` replay,新 epoch 的 DDL/SyncPoint 不能被旧 epoch 的 commitTs 去重状态过滤。 + +- `maintainer/barrier_event.go` + - 新增 selected 阶段重置逻辑:确保 range checker 存在,重置 DONE 阶段 coverage,并把已越过当前 barrier 的 replication 计入 coverage。 + - `checkBlockedDispatchers` 的 checkpoint-forward 路径不再只设置 `selected/writerDispatcherAdvanced`,而是走同一 selected 阶段初始化逻辑。 + - `forwardBarrierEvent` 新增同一 `(BlockTs, IsSyncPoint)` 且 `Stage_DONE` 的判断。 + - 保留 `checkpointTs > commitTs` 的严格条件,不使用 `>=`,避免 dispatcher 以 `startTs == commitTs` 重建时跳过仍需 flush 的 syncpoint。 + - 针对 Normal DROP barrier 的迟到 WAITING 重建,新增缺失 dropped table 覆盖逻辑。 + - 仅当 `BlockedTables.InfluenceType == Normal`、`NeedDroppedTables.InfluenceType == Normal`、tableID 属于 `NeedDroppedTables` 且 `spanController` 中已无该 table task 时,才把该 tableID 标记为覆盖。 + - 目的:table dispatcher 已因先前完成的 DROP/TRUNCATE 调度被删除后,迟到重建的 barrier 不再永久等待已删除的 dispatcher;事件进入 selected/pass 阶段后仍会给 DDL span 发送 `Action_Pass`,不会重复执行 `Action_Write`。 + +- `maintainer/barrier_test.go` 
+ - 新增 `TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers`。 + - 覆盖 syncpoint barrier 完成删除后被迟到 WAITING 重建、再由 checkpoint-forward 进入 selected 的场景。 + - 断言同 ts 已 DONE 的 dispatcher 会被计入重建后的 range checker,barrier 不会永久等待旧 DONE 状态。 + - 新增 `TestForwardBarrierEventBoundaries`,覆盖 `checkpointTs == commitTs` 不推进、`checkpointTs > commitTs` 推进、同 ts syncpoint/DDL DONE 与 WAITING 的顺序边界。 + - 新增 `TestNormalBarrierRecreatedAfterDroppedTableRemoved`。 + - 覆盖 DROP TABLE barrier 已完成并删除 table dispatcher 后,DDL dispatcher 迟到 WAITING 重建同一 barrier 的场景。 + - 新增 `TestNormalBarrierDoesNotCoverMissingNonDroppedTable`,确认非 drop Normal barrier 不会因为 table task 缺失而被误推进。 + +- `downstreamadapter/eventcollector/dispatcher_stat_test.go` + - 新增 `TestAdvanceEpochForResetClearsCommitTsFilter`。 + - 覆盖旧状态已推进到 `220`、reset 到 `150`、新 epoch replay `180` DDL 的场景。 + +## 验证记录 +(注:本节的验证条目列在下面“新增失败:Normal DDL 迟到 WAITING 重建”小节的“处理”条目之后,从 `go test ./pkg/eventservice ...` 开始。) +## 新增失败:Normal DDL 迟到 WAITING 重建 + +- 5 连跑的第 1 次 attempt(seed `2026061509`)中,workload 已结束并进入 converge,但 finish mark 长时间未同步到下游。 +- `runner.log` 从 `2026/06/15 15:38:08` 开始等待 finish mark;checkpoint 最终卡在 `467019061784216256`。 +- maintainer 日志反复报告普通 DROP TABLE barrier `467019061784216242` 未 resolved:`reported count: 1, require count: 2, uncovered tables: 267`,blocked tables 为 `[267,0]`。 +- 同一 commitTs 的前序日志显示 `2026/06/15 23:40:47.934 +08:00` 已经 `all dispatchers reported event done, remove event`;随后 `2026/06/15 23:40:48.172 +08:00` table 267 dispatcher 被 remove/unregister。 +- 但 `2026/06/15 23:40:48.324 +08:00` DDL dispatcher 才收到同一个 `DROP TABLE db3.t10` 并上报 WAITING,maintainer 因已删除旧 event 而重新创建 barrier。 +- 根因:`blockedEvents` 不保留已完成事件历史;迟到 WAITING 重建 Normal DDL barrier 后,`checkBlockedDispatchers` 只检查仍存在的相关 replication 是否已 forward,未把已经删除的 dropped table 视为完成,导致永远等待 table 267。 +- 处理:中断该注定失败的 attempt,补充 Normal dropped table 缺失覆盖逻辑后重新验证。 + +- `go test ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` + - 结果:失败,原因是远端默认 `go` 为 1.25.3,`go.mod` 要求 `>=1.25.10` 且 `GOTOOLCHAIN=local`。 + +- `GOTOOLCHAIN=auto go test 
./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` + - 结果:失败,原因是 TiDB testkit 要求 `--tags=intest`。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -run "TestScanAndMergeEventsSkipsDeletedTableTxn|TestEventScannerWithDeleteTable" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run TestAdvanceEpochForResetClearsCommitTsFilter -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run "TestAdvanceEpochForResetClearsCommitTsFilter|TestCheckpointTsForEventServiceUsesCollectorObservedMaxTs|TestFilterAndUpdateEventByCommitTs|TestHandleSingleDataEventsUpdatesDDLStateAndDedupsSameTsDDL|TestHandleSignalEvent|TestGroupHeartbeatResetThenHandshake" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` + - 结果:再次通过。 + +- `git diff --check -- pkg/eventservice/event_scanner.go pkg/eventservice/event_scanner_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/dispatcher_stat_test.go .issue/weekly_rand_single_failure_analysis.md` + - 结果:通过。 + +- reviewer subagent:event scanner 修复审查 + - 结果:未发现阻塞问题。 + - 结论:`finalizeScan(..., rawEvent.CRTs)` 只应用于 table 已在 `rawEvent.CRTs-1` 不存在的情况,不会丢弃同 commit ts 下仍可用旧 schema 解码的 DML。 + +- reviewer subagent:eventcollector reset 修复审查 + - 结果:未发现阻塞问题。 + - 结论:旧 epoch 事件仍由 epoch 过滤拦截;新 epoch 仍要求 handshake/seq;heartbeat 仍受 `maxEventTs` 限制;reset 清理 commitTs flags 只避免旧 epoch 状态误杀新 epoch replay。 + +- `GOTOOLCHAIN=auto make integration_test_build_fast` + - 结果:通过。 + +- 第一次重新运行 `weekly_rand_single` + - 命令:`GOTOOLCHAIN=auto 
RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=2026061509 tests/integration_tests/run.sh mysql weekly_rand_single`。 + - 结果:失败;原 table 145 DDL 缺失问题没有复现,新的卡点为 syncpoint `467018333552640000`。 + - 失败摘要:runner 报 `checkpoint did not advance for 5m9.912642314s (hard=5m0s)`,maintainer 报 `active_ddl=1`。 + - 关键 barrier 日志:第二个同 ts syncpoint barrier 的 coverage 为 `reported count: 184, require count: 185, uncovered tables: 299`。 + - 定位结论:第一次 barrier 完成删除后,迟到 WAITING 重建第二个 barrier;table 299 已在第一次 barrier 上报 DONE,但第二个 barrier 的 range checker 未继承该状态。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` + - 结果:通过。 + +- `git diff --check -- pkg/eventservice/event_scanner.go pkg/eventservice/event_scanner_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/dispatcher_stat_test.go maintainer/barrier_event.go maintainer/barrier_test.go .issue/weekly_rand_single_failure_analysis.md` + - 结果:通过。 + +- reviewer subagent:maintainer barrier 修复审查 + - 结果:未发现阻塞问题。 + - 结论:checkpoint-forward 进入 selected 时会重建/重置 range checker,并把 `checkpointTs > commitTs` 或同一 barrier 已 DONE 的 dispatcher 计入覆盖。 + - 边界确认:没有放宽 `checkpointTs == commitTs`;同 ts DDL DONE 不会误判 syncpoint 已完成;同 ts syncpoint 状态推进 DDL barrier 仍符合 `(commitTs, isSyncPoint)` 顺序。 + - 建议:补充 `forwardBarrierEvent` 边界单测;已补 `TestForwardBarrierEventBoundaries`。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestForwardBarrierEventBoundaries|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` + - 结果:再次通过。 + +- 第一次 5 连跑 attempt 1(seed `2026061509`) + - 命令:`GOTOOLCHAIN=auto RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=2026061509 tests/integration_tests/run.sh mysql weekly_rand_single`(5 seed loop 的第 1 次)。 + - 结果:中断;已确定会卡在 Normal DROP TABLE barrier 
`467019061784216242`,未继续等待到 runner 自身超时。 + - 新根因:迟到 WAITING 重建已完成的 Normal DROP barrier,table 267 dispatcher 已被删除后无法再次上报。 + +- explorer subagent:Normal DROP barrier 迟到重建修复边界 + - 结果:确认主线方向正确,但必须限制为 `NeedDroppedTables` 中且已无 task 的 Normal tableID。 + - 结论:进入 selected/pass 阶段后应发送 `Action_Pass`,不能重新 `Action_Write`;非 drop Normal barrier 不能因为 table 缺失被覆盖。 + - 已按建议收紧实现并补负向单测。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers|TestForwardBarrierEventBoundaries" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` + - 结果:通过。 + +## 新增失败:ReleasePath 清掉 blocked dispatcher 的后续 barrier 事件 + +- 第二次 5 连跑 attempt 1(seed `2026061509`)失败在 syncpoint `467019914280960000`。 +- runner 报错:`checkpoint did not advance for 5m9.924647605s (hard=5m0s)`。 +- maintainer coverage:`reported count: 193, require count: 194, uncovered tables: 164`。 +- table 164 dispatcher ID:`106834843083412973892104354966216139137`。 + +关键时间线: + +- `2026-06-16 00:10:21.030 +08:00`:event_broker 向 table 164 发送 DDL `CREATE INDEX idx_ts_1705 ON db1.t16(ts)`,`commitTs=467019911350976624`,`seq=5`。 +- `2026-06-16 00:10:21.037 +08:00`:event_broker 随后发送目标 syncpoint `467019914280960000`,`seq=6`。 +- `2026-06-16 00:14:46.480 +08:00`:eventcollector memory control 对 table 164 所在 path 执行 `ReleasePath`,`releasedSize=6652`。 +- `2026-06-16 00:15:09.168 +08:00`:table 164 的 DDL seq=5 才收到 maintainer pass 并处理完成,耗时 `38.62552003s`。 +- `2026-06-16 00:15:09.170 +08:00`:dispatcher 下一条处理到的是 `ResolvedEvent seq=30`,而 `lastEventSeq=5`,触发 out-of-order reset。 +- `2026-06-16 00:15:09.173 +08:00`:旧 epoch 的 syncpoint `seq=31` 被识别为 stale epoch 并忽略。 + +根因判断: + +- `EventsHandler` 对 DDL/SyncPoint/DML 使用 dynstream,同一个 dispatcher path 在 DDL/SyncPoint 阻塞期间会停止消费后续事件。 +- eventcollector memory control 在内存压力下会给 blocked path 发送 `ReleasePath`,`processDSFeedback` 
原逻辑只调用 `ds.Release(path)` 清空该 path pending queue。 +- 被清空的 pending queue 中包含 event_broker 已按顺序发送但 dispatcher 尚未消费的 syncpoint/DDL,例如 table 164 的 syncpoint `467019914280960000 seq=6`。 +- 清队列后 eventcollector 没有立即 reset eventservice,导致 eventservice 继续认为 dispatcher 还在同一个 epoch 顺序消费;等 DDL 解阻后,dispatcher 看到最新 resolved event 的 seq 跳跃才 reset。 +- 这个 reset 太晚:目标 syncpoint 已经在旧 epoch 被清掉且没有进入 dispatcher,maintainer 的 All syncpoint barrier 因缺 table 164 report 永久卡住。 + +修复方案: + +- 在 `downstreamadapter/eventcollector/event_collector.go` 中抽出 `handleReleasePathFeedback`。 +- 收到 `ReleasePath` 后仍先调用 dynstream `Release(path)`,确保旧 pending queue 会被清理。 +- 随后查找该 dispatcher 的 `dispatcherStat`,如果还存在,立即调用 `stat.session.resetCurrentEventService()`。 +- 顺序要求是先 enqueue Release,再发送 RESET;这样新 epoch handshake 会排在 release 之后,避免刚清掉的旧队列和新 epoch 事件混杂。 +- 对 default DS 和 redo DS 使用同一 helper,保留原来的 `memoryReleaseCount` 统计,用于 eventservice scan-window 压力调整。 + +补充测试: + +- `downstreamadapter/eventcollector/dispatcher_stat_test.go` + - 新增 `TestReleasePathFeedbackResetsCurrentEventService`。 + - 构造一个正在从 local eventservice 收数据的 dispatcher session。 + - 调用 `handleReleasePathFeedback`。 + - 断言 release callback 被调用、`memoryReleaseCount` 增加、并向当前 eventservice 发出 `ACTION_TYPE_RESET` 请求。 + +补充验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run "TestReleasePathFeedbackResetsCurrentEventService|TestAdvanceEpochForResetClearsCommitTsFilter" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` + - 结果:通过。 + +## 新增失败:selected syncpoint 被旧 DDL replay 回退 + +- 第三次 5 连跑 attempt 1(seed `2026061509`)失败在 syncpoint `467020488376320000`。 +- runner 报错:`checkpoint did not advance for 5m9.991098123s (hard=5m0s)`。 +- maintainer 选中了 syncpoint barrier,但 coverage 长期为 `reported count: 174, 
require count: 178, uncovered tables: 228, 0, 341, 562`。 +- 更早的普通 DDL barrier 已完成并发送过 pass: + - `467020488239480915`:`ALTER TABLE db2.t17 ADD PARTITION`,涉及 table `226,227,228,578,0`。 + - `467020488305017082`:`ALTER TABLE db4.t16 ADD PARTITION`,涉及 table `339,340,341,0`。 +- 后续日志显示这些 dispatcher 在收到 syncpoint pass 之前,又因为 ReleasePath/reset 后的 eventservice replay 收到了旧 DDL WAITING。 +- `basic_dispatcher.go` 反复打印 `ignore stale block event action`:例如 table 228/578/341 的 `pendingEventCommitTs` 是旧 DDL commit ts,而 maintainer 下发的 action commit ts 是更晚的 syncpoint `467020488376320000`。 + +根因判断: + +- ReleasePath/reset 修复后,eventservice 会重新发送被释放 path 中的旧 block event,这是正确行为。 +- 但是 dispatcher 原来只保存一个当前 `blockPendingEvent`,不记已经完成过的 block event 高水位;因此旧 DDL replay 可以把本地 pending 状态从更晚的 syncpoint 回退到更早的 DDL。 +- `actionMatchs` 原来只比较 commit ts,没有比较 `IsSyncPoint`;同 ts DDL/syncpoint 场景下也存在误匹配风险。 +- maintainer 侧 `span.UpdateBlockState` 原来会直接覆盖状态;迟到的旧 WAITING 可能把该 dispatcher 在 barrier 计算中的 block state 回退。 +- selected barrier 进入 pass/write 阶段后,只在 selected 前做过一次 forwarded dispatcher 统计;如果 selected 后 dispatcher 再上报更晚的 WAITING,没有重新用 `forwardBarrierEvent` 刷新 range checker,syncpoint barrier 会继续等已经前进过的 dispatcher。 + +修复方案: + +- 在 dispatcher 的 `BlockEventStatus` 中增加已完成 block event 水位,按 `(commitTs, isSyncPoint)` 排序,其中同 commit ts 下 DDL 在 syncpoint 之前。 +- `reportBlockedEventDone` 记录完成水位;`DealWithBlockEvent` 在 flush DML 后发现 replay 的 block event 不大于完成水位时,直接 pass 到 sink、记录完成并上报 DONE,不再向 maintainer 重新报告 WAITING。 +- `actionMatchs` 增加 `IsSyncPoint` 比较,避免同 commit ts 的 DDL/syncpoint action 互相匹配。 +- maintainer 更新 dispatcher block state 时改为 `updateSpanBlockState`,只接受按 `(BlockTs, IsSyncPoint, Stage)` 不回退的新状态。 +- selected barrier 在 `resend` 中调用 `refreshSelectedProgress`,重新把已经 forward 到更晚 block event 的 dispatcher 加入 range checker;writer dispatcher 也用同一规则重新判断。 + +补充测试: + +- `downstreamadapter/dispatcher/helper_test.go` + - `TestBlockEventStatusCompletedWatermark` 覆盖完成 syncpoint 后旧 DDL replay 被识别为 obsolete,同时确认完成 DDL 不会覆盖同 
ts syncpoint。 + - `TestBlockEventStatusActionMatchesSyncPointFlag` 覆盖 action 必须同时匹配 commit ts 和 `IsSyncPoint`。 +- `maintainer/barrier_test.go` + - `TestSelectedBarrierRefreshesAdvancedReplications` 覆盖 selected 后 dispatcher 上报更晚 normal DDL WAITING,syncpoint barrier resend 时可刷新覆盖并推进。 + - `TestUpdateSpanBlockStateSkipsStaleState` 覆盖 maintainer 不接受旧 block state 回退。 + - `TestForwardBarrierEventBoundaries` 新增更晚 normal WAITING 可以 forward syncpoint 的边界。 + +补充验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBlockEventStatusCompletedWatermark|TestBlockEventStatusActionMatchesSyncPointFlag" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestSelectedBarrierRefreshesAdvancedReplications|TestUpdateSpanBlockStateSkipsStaleState|TestForwardBarrierEventBoundaries|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers|TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` + - 结果:通过。 + +- 直接 `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBatchDMLEventsPartialFlush|TestRedoBatchDMLEventsPartialFlush" -count=1` 会失败;原因是该测试依赖 failpoint transform,直接 `go test` 时 `failpoint.Inject` 是空 marker,不能作为业务回归失败判断。 + +- `GOTOOLCHAIN=auto make failpoint-enable && GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBatchDMLEventsPartialFlush|TestRedoBatchDMLEventsPartialFlush" -count=1 -v && GOTOOLCHAIN=auto make failpoint-disable` + - 结果:通过。 + +- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` + - 结果:通过,119 个测试通过;`maintainer` coverage `67.4%`,`downstreamadapter/dispatcher` coverage `61.1%`。 + +## 新增失败:已完成 DDL 后 replay 旧 DML 导致 downstream 表不存在 + +- 第四次 5 连跑 attempt 1(seed `2026061509`)失败为 `changefeed state is not normal: warning`。 +- 直接错误来自 MySQL sink:`Error 1146 (42S02): Table 
'db1.t15' doesn't exist`。 +- CDC 日志证据: + - `cdc.log:224376`:`dispatcher_manager.go` 报 `Event Dispatcher Manager Meets Error`,失败 SQL 包含 `REPLACE INTO db1.t15`。 + - `cdc.log:239541`:maintainer 收到 dispatcher error,错误 DML 的 `startTs/commitTs` 包含 `{467021254219269128 467021254219269151}`。 + - `cdc.log:240517`:changefeed maintainer report error,state 进入 `warning`。 + - `cdc.log:240742`:coordinator 将 changefeed 状态更新为 `warning`。 + +关键时间线: + +- `2026-06-16 01:38:20.074 +08:00`:event_broker reset table 571 dispatcher `1464598323537297354314327360035871696782`,`newStartTs=467021254219269150`,`newEpoch=2`。 +- `2026-06-16 01:38:34.118 +08:00`:table trigger dispatcher 收到 `RENAME TABLE db1.t15 TO db1.t15_r_3235459`,`commitTs=467021254232638300`。 +- `2026-06-16 01:38:34.426 +08:00`:MySQL sink 成功执行 rename DDL。 +- `2026-06-16 01:38:34.971 +08:00`:maintainer 看到 table trigger dispatcher 和 table dispatcher 均 DONE,移除该 rename barrier。 +- `2026-06-16 01:38:37.306 +08:00`:event_broker 在 reset 后又向 table 571 dispatcher 发送旧 DML,`commitTs=467021254219269151`,小于 rename DDL commit ts。 +- `2026-06-16 01:38:37.373 +08:00`:table dispatcher 收到该旧 DML,表名仍是 `db1.t15`。 +- `2026-06-16 01:38:37.657 +08:00`:sink 执行该旧 DML,此时 downstream 已 rename/drop `db1.t15`,于是报 1146。 + +根因判断: + +- 前一轮修复让 ReleasePath/reset 后可以正确 replay 被释放队列中的 block event,解决了 barrier 缺上报的问题。 +- 但 replay 也会把已完成 DDL 之前的旧 DML 重新送到 dispatcher。 +- dispatcher 在 `reportBlockedEventDone` 之后已经能知道某个 DDL/syncpoint barrier 完成;完成这个 barrier 意味着 `FlushDMLBeforeBlock` 已经保证该 barrier 之前的 DML 要么已进入 sink,要么已完成 flush。 +- 因此同一 dispatcher 后续 replay 进来的 `commitTs <= completedBlockCommitTs` 的 DML 是过期事件,继续写 sink 会在 rename/drop/truncate 后访问旧表名,导致下游 `table doesn't exist`。 + +修复方案: + +- 在 `BlockEventStatus` 中增加 `isDMLCompletedOrObsolete(commitTs)`。 +- `AddDMLEventsToSink` 在 active-active/soft-delete 过滤前先检查 DML commit ts:如果 `commitTs <= completedBlockCommitTs`,直接跳过该 DML,不加入 `tableProgress`,也不调用 `sink.AddDMLEvent`。 +- 该过滤只在 dispatcher 已记录完成过 block event 
后生效,不影响正常首次消费;完成水位来自 `reportBlockedEventDone`,即 DDL/syncpoint 已经写入或 pass 并向 maintainer 报 DONE。 + +补充测试: + +- `downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` + - 新增 `TestHandleEventsSkipsDMLBeforeCompletedBlockEvent`。 + - 构造一个已完成 block event commitTs 为 `120` 的 dispatcher。 + - 喂入 commitTs `120` 的旧 DML 和 commitTs `140` 的新 DML。 + - 断言 sink 只收到 commitTs `140` 的新 DML。 + +补充验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestHandleEventsSkipsDMLBeforeCompletedBlockEvent|TestBlockEventStatusCompletedWatermark|TestBlockEventStatusActionMatchesSyncPointFlag" -count=1` + - 结果:通过。 + +- `git diff --check -- downstreamadapter/dispatcher/helper.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` + - 结果:通过。 + +- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` + - 结果:通过,120 个测试通过;`maintainer` coverage `67.4%`,`downstreamadapter/dispatcher` coverage `61.4%`。 + +## 待完成 + +- 重新构建 `cdc` 并运行 `weekly_rand_single` case 连续 5 次通过;优先用原失败 seed `2026061509` 覆盖,再继续跑后续 seed 确认连续通过。 + + +## 最新失败:finish mark 收敛超时,Normal blocker 中的 DDL span tableID=0 未被覆盖 + +- 新一轮 5 连跑的第 1 次 attempt(seed `2026061509`)失败在 converge timeout:`runner failed: context deadline exceeded`。 +- workload 已经结束,上游 `db1.finish_mark` 已创建并写入 `2026061509`,下游没有出现 `db1.finish_mark`。 +- `cdc-2026-06-16T09-25-02.931.log:104288` 显示 event broker 已向 table trigger dispatcher 发送 finish mark DDL,`commitTs=467028537897123860`。 +- runner timeout 时 changefeed checkpoint 只推进到 `467028289781760000`,明显落后于 finish mark DDL。 +- CDC 日志反复出现 `barrier event is not resolved`,并显示 `uncovered tables: 0`;同时大量 `register dispatcher with large startTs lag` 表明 schedule-required DDL 已经持续堆积。 +- 一个典型卡点是 normal DDL `ALTER TABLE db1.t18 ADD PARTITION ...`,`commitTs=467028226954756564`,blocker tableIDs 包含 `2640 172 173 174 925 1265 0`。 + +根因判断: + +- Normal DDL blocker 会把 `common.DDLSpanTableID`(值为 `0`)放进 
`BlockedTables.TableIDs`,代表 table trigger / DDL span 也需要参与 barrier。 +- maintainer 的 Normal 分支原来统一通过 `spanController.GetTasksByTableID(tableID)` 找 replication。 +- 对 `tableID=0`,`GetTasksByTableID(0)` 不会返回 DDL dispatcher;DDL dispatcher 需要通过 `GetTaskByID(GetDDLDispatcherID())` 获取。 +- 因此迟到 WAITING 重建或 checkpoint-forward 场景中,即使 DDL dispatcher 已经前进,`checkBlockedDispatchers`、`relatedReplications` 和 `sendPassAction` 都无法把 tableID 0 计入覆盖或 PASS 目标,最终 barrier 反复显示 `uncovered tables: 0`,checkpoint 无法追到 finish mark。 + +修复方案: + +- 在 `maintainer/barrier_event.go` 新增 `getTasksByBlockedTableID(tableID)`。 +- 普通 tableID 仍走 `spanController.GetTasksByTableID(tableID)`。 +- `common.DDLSpanTableID` 改为走 `spanController.GetTaskByID(spanController.GetDDLDispatcherID())`。 +- 将 Normal blocker 的三处路径切到该 helper: + - `relatedReplications`:checkpoint-forward / DONE 阶段 coverage 能看到 DDL span。 + - `sendPassAction`:Normal PASS 能把 DDL dispatcher 纳入 influenced dispatchers。 + - `checkBlockedDispatchers`:迟到 WAITING 时能通过已前进的 DDL dispatcher 推进 barrier。 + +补充测试: + +- `maintainer/barrier_test.go` 新增 `TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID`。 +- 场景:普通 table dispatcher 上报 Normal WAITING,blocker tableIDs 为 `[1, common.DDLSpanTableID]`;DDL span checkpoint 已大于 barrier commitTs,但 DDL dispatcher 没有再上报 WAITING。 +- 断言:`checkBlockedDispatchers` 能通过 tableID 0 找到 DDL dispatcher,进入 selected/pass 阶段;`resend` 发出的 PASS 同时包含普通 table dispatcher 和 DDL dispatcher。 + +补充验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID|TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable|TestSelectedBarrierRefreshesAdvancedReplications|TestForwardBarrierEventBoundaries" -count=1` + - 结果:通过。 + +- `git diff --check -- maintainer/barrier_event.go maintainer/barrier_test.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/helper.go 
downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go downstreamadapter/dispatcher/helper_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/event_collector.go pkg/eventservice/event_scanner.go` + - 结果:通过。 + +- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` + - 结果:通过,121 个测试通过;`maintainer` coverage `67.6%`,`downstreamadapter/dispatcher` coverage `61.4%`。 + + +## subagent 审查新增风险:selected schedule barrier 与 held obsolete block event + +subagent 审查后确认两个额外风险,需要在最终 5 连跑前修掉: + +1. `refreshSelectedProgress` 可以通过 checkpoint/blockState forwarding 把 selected barrier 的 writer 标成 advanced。 + - 对普通 DDL/syncpoint 这是正确的,可以避免迟到 WAITING 重建后卡住。 + - 但对 `needSchedule` DDL,如果直接标记 writer advanced,`Barrier.handleEventDone` 中的 `tryScheduleEvent` 不会执行,后续可能先发 PASS,导致 add/drop table scheduling 没应用。 + - 风险表现:新表未加入 spanController、旧表未删除、`pendingEvents` 未清空,后续 DB/All barrier 的 range checker 使用错误任务快照。 + +2. table-trigger dispatcher 的 DB/All block event 可能因为 `pendingACKCount > 0` 被 hold。 + - 直接 `DealWithBlockEvent` 的非 hold 路径已有 obsolete block event 跳过逻辑。 + - 但 hold 分支和 `flushBlockedEventAndReportToMaintainer` 释放路径缺少同样检查。 + - 风险表现:已经完成的 replay DB/All DDL/syncpoint 被重新 report WAITING,可能造成重复 WRITE/PASS 或重建 barrier。 + +修复: + +- `maintainer/barrier.go` + - `Barrier.Resend` 改为调用 `barrierEvent.resendWithSchedule(b.mode, b.tryScheduleEvent)`。 + - 真实 barrier resend 路径拥有 pending schedule queue,因此可以在 writer 被 forwarding 判定越过时先执行 `tryScheduleEvent`。 + +- `maintainer/barrier_event.go` + - `refreshSelectedProgress` 改为返回 writer 是否已 forward。 + - 如果 event `needSchedule`,该函数只返回 true,不直接设置 `writerDispatcherAdvanced`。 + - `resendWithSchedule` 在 `needSchedule` 且 writer forward 时调用 `tryScheduleEvent`;只有 schedule 成功后才进入 PASS 发送路径。 + - 直接 `event.resend` 保留无调度回调版本,供单元测试和非 barrier 调用使用。 + +- `downstreamadapter/dispatcher/basic_dispatcher.go` + - 新增 `completeObsoleteBlockEvent`,统一执行:检查 completed watermark、local pass、report 
DONE、wake dispatcher status stream。 + - `DealWithBlockEvent` 的 held path、普通 blocking path、`flushBlockedEventAndReportToMaintainer` 释放 path 都复用该函数。 + - replay 的 obsolete DB/All block event 不再重新进入 WAITING。 + +新增测试: + +- `maintainer/barrier_test.go` + - `TestResendSchedulesForwardedNeedScheduleBarrierBeforePass`:构造 selected + needSchedule barrier,DDL dispatcher checkpoint 已越过 barrier;断言 `Barrier.Resend` 会先 pop `pendingEvents` 并 schedule 新表,再发 PASS。 + +- `downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` + - `TestHeldObsoleteBlockEventCompletesWithoutWaitingReport`:构造 table-trigger dispatcher hold 一个 syncpoint;随后 completed watermark 覆盖该 syncpoint,再释放 held event;断言输出 DONE,且没有新增 resend task / WAITING。 + +验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestResendSchedulesForwardedNeedScheduleBarrierBeforePass|TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID|TestSelectedBarrierRefreshesAdvancedReplications" -count=1` + - 结果:通过。 + +- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestHeldObsoleteBlockEventCompletesWithoutWaitingReport|TestHandleEventsSkipsDMLBeforeCompletedBlockEvent|TestBlockEventStatusCompletedWatermark" -count=1` + - 结果:通过。 + +- `git diff --check -- maintainer/barrier.go maintainer/barrier_event.go maintainer/barrier_test.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` + - 结果:通过。 + + +## 最新失败:修复 correctness 后,weekly 收敛窗口不足 + +5 连跑第 1 次 attempt(seed `2026061509`)在修复后继续运行到 workload 结束,但失败点变成收敛超时: + +```text +2026/06/16 02:55:45.661383 workload finished, waiting for converge: 20s +2026/06/16 03:26:05.681526 runner failed: context deadline exceeded +===== weekly_rand_single failed seed=2026061509 status=1 ===== +``` + +关键事实: + +- 上游 finish mark 已写入:`db1.finish_mark` 中存在 `id=1, v=2026061509`。 +- 下游到 timeout 时仍没有 `db1.finish_mark`。 +- CDC schema store 已读到并发送 finish mark DDL:`CREATE TABLE IF NOT EXISTS 
db1.finish_mark`,`finishedTs=467030131521880072`。 +- 该 ts 的物理时间是 `2026-06-16 10:56:05.681`。 +- timeout 前最新 checkpoint 为 `467029854781439999`,物理时间 `2026-06-16 10:38:29.999`。 +- 因此 timeout 时距离 finish mark 还差约 `17m36s` 的 TiDB 逻辑时间。 + +推进速率: + +- `03:08:56` 时 checkpoint 约为 `2026-06-16 10:32:50.282`。 +- `03:23:26` 时 checkpoint 约为 `2026-06-16 10:37:51.582`。 +- 约 `14.5m` 真实时间推进了约 `5m01s` 逻辑时间。 +- subagent 复核的整段 converge 速率约为 `0.30-0.31x` realtime;剩余 `17m36s` 逻辑时间预计还需要约 `56-58m`。 + +根因判断: + +- 前面已修复的 event scanner、dispatcher completed watermark、maintainer barrier coverage/schedule 问题解决的是 correctness 卡死风险。 +- 本次最新失败没有出现 checkpoint 永久不动、changefeed failed、checksum diff 或 panic/fatal/race。 +- 失败由 weekly profile 的 workload/backlog 与固定 `converge_timeout=30m` 不匹配触发:30 分钟 workload 可制造超过 30 分钟才能追完的积压。 + +修复计划: + +1. 保留 smoke profile 的短收敛窗口,避免普通本地短跑变慢。 +2. 为 weekly random DDL case 增加 `RUN_CONVERGE_TIMEOUT` 环境变量。 +3. 当 `RUN_PROFILE=weekly` 且未显式指定 `RUN_CONVERGE_TIMEOUT` 时,将默认 converge timeout 提高到 `120m`。 +4. 在 `run_weekly_rand_ddl_it_in_ci.sh` 中显式导出并打印 `RUN_CONVERGE_TIMEOUT`,让 CI 日志能直接看到该参数。 +5. 重新运行原失败 seed,并继续跑到 5 次连续通过。 + +修改文件: + +- `tests/integration_tests/weekly_rand_single/run.sh` +- `tests/integration_tests/weekly_rand_multi/run.sh` +- `tests/integration_tests/weekly_rand_multi_failover/run.sh` +- `tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh` +- `tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh` + +验证计划: + +- `bash -n` 检查所有改动的 shell 脚本。 +- 生成 `runner_config.json` 后确认 weekly profile 的 `verify.converge_timeout` 为 `120m`,smoke profile 默认仍为 `30m`。 + +## 最新失败:`RECOVER TABLE` 下游 schema 非确定 + +第二次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 后不再因为 30 分钟收敛窗口退出,但 changefeed 进入 warning: + +关键日志: + +```text +runner failed: changefeed state is not normal: warning +Error 1054 (42S22): Unknown column 'a' in 'field list' +REPLACE INTO `db1`.`t15_r_3235459` (`id`,`b`,`c`,`d`,`e`,`bin`,`a`) VALUES (...) 
+``` + +时间线: + +- 上游 `03:42:43` 对 `db1.t15_r_3235459` 执行 `ALTER TABLE ... DROP COLUMN a`,随后 `03:42:44` 执行 `DROP TABLE`。 +- 上游 `03:43:48` 执行 `RECOVER TABLE db1.t15_r_3235459`,TiCDC DDL event 的 `TableInfo` 是 recovered table,后续 DML schema 包含列 `a`。 +- MySQL sink 在下游直接执行原始 `RECOVER TABLE db1.t15_r_3235459` 并成功。 +- recover 后新 table dispatcher handshake 的 tableID 为 `1758`,resolved ts 为 recover commitTs `467030881924284444`。 +- 第一条后续 DML commitTs 为 `467030882068463689`,SQL builder 根据上游 recovered `TableInfo` 生成带 `a` 的 REPLACE;下游实际表缺列 `a`,因此 DML 达到最大重试并使 changefeed warning。 + +根因判断: + +- `RECOVER TABLE` 依赖执行集群本地 DDL history / recycle-bin / GC snapshot 状态;裸 `RECOVER TABLE db.t` 由下游 TiDB 自己选择历史表。 +- TiCDC 内部 schema store 能按上游 DDL job 得到 recovered `TableInfo`,但 sink 执行原始 SQL 后,下游可能恢复出不同历史 schema。 +- `RECOVER TABLE BY JOB JOB_ID` 不能直接用上游 drop job id 修复;下游执行该语法时查询的是下游本地 DDL job id,当前 TiCDC 没有维护上游 drop/truncate job id 到下游 job id 的映射。 +- 将 recover 改写为 `CREATE TABLE` 也不是正确修复,因为 `RECOVER TABLE` 的产品语义包含恢复旧数据,单纯建空表会丢数据。 +- 因此这是 CDC 对 `RECOVER TABLE` 复制语义支持不完整的问题,不适合作为 weekly random DDL 的默认压力操作。 + +修复决策: + +1. 不在 random DDL 默认集合中生成 `recover_table`,避免 weekly case 稳定触发一个当前不具备确定复制语义的 DDL。 +2. 当前修复范围只调整 random runner;正式支持 `RECOVER TABLE` 需要单独设计 deterministic recover,比如维护下游 drop/truncate job id 映射并处理路由、重试、GC,或在 recover 后做数据重建/快照补偿。 +3. 
保留 `genRecoverTable` 函数,供将来显式测试或产品级修复验证使用。 + +已实施修复: + +- `tests/utils/random_ddl_test_runner/ddl.go` + - 从 `defaultDDLKinds()` 中移除 `recover_table`。 + - 增加注释说明裸 `RECOVER TABLE` 为什么不能作为 CDC random DDL 默认操作。 +- `tests/utils/random_ddl_test_runner/ddl_test.go` + - 新增 `TestDefaultDDLKindsExcludeRecoverTable`,防止默认集合再次加入 `recover_table`。 + - 同时确认 `genRecoverTable` 仍可用于显式测试。 + +新增验证: + +- `GOTOOLCHAIN=auto go test ./tests/utils/random_ddl_test_runner -run "TestDefaultDDLKindsExcludeRecoverTable|TestGen" -count=1` + - 结果:通过。 + +后续验证: + +- 重新运行 shell 语法检查、random runner 包测试和 fast integration build。 +- 重新运行 `weekly_rand_single` 原 seed 和后续 seeds,直到连续通过 5 次。 +- 如果后续再出现 failure,应按新的 `runner failed:` 类型继续分类,不能再把 `RECOVER TABLE` schema mismatch 当成 timeout 问题。 + +## 最新失败:dispatcher recreate 使用旧 startTs 重放 DDL 前 DML + +第三次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 且移除 `RECOVER TABLE` 默认生成后仍失败,但失败类型已经不是 30 分钟收敛窗口不足,也不是 `RECOVER TABLE` schema 非确定性。changefeed 进入 warning,sink DML 达到最大重试: + +```text +runner failed: changefeed state is not normal: warning +[CDC:ErrReachMaxTry] ... REPLACE INTO `db2`.`t14_r_3402273` (`id`,`a`,`b`,`c`,`d`,`e`,`bin`) ... 
+Error 1054 (42S22): Unknown column 'bin' in 'field list' +Error 1054 (42S22): Unknown column 'e' in 'field list' +``` + +关键时间线(`/tmp/tidb_cdc_test/weekly_rand_single/cdc.log`): + +- `13:10:41.127`:旧 dispatcher `42013703021131921156525107956428657798` 收到 local event service ready,reset 到 `resetTs=467031808440533390`。 +- `13:10:42.033`:旧 dispatcher 从 resetTs 后开始 replay table `1947` 的 DML,第一条 DML commitTs 为 `467031808466747878`。 +- `13:10:42.154`:旧 dispatcher 收到并处理 `ALTER TABLE db2.t14_r_3402273 DROP COLUMN bin`,DDL commitTs 为 `467031808781320457`。 +- `13:10:42.847`:旧 dispatcher stopped,返回最终 checkpoint `467031809410466489`,说明它已经把上述 DML 和后续 DDL 之前的事件 flush 完。 +- `13:10:42.851`:更晚的 barrier `467031809423835501` 已完成并从 `blockedEvents` 删除。 +- `13:10:42.973`:一个更旧的 add-table barrier `467031808440533390` 迟到执行 `AddNewTable(tableID=1947)`。 +- `13:10:42.974`:新 dispatcher `155867647287056528072758204918054850230` 被创建,checkpoint/startTs 仍是旧的 `467031808440533390`。 +- `13:10:44.963`:新 dispatcher 再次 replay commitTs `467031808466747878` 的 DML。此时下游 schema 已经由旧 dispatcher 执行过 `DROP COLUMN bin`,所以同一条 DML 打到 post-DDL schema,报 `Unknown column 'bin'`。 + +根因判断: + +- 这是 stale barrier 和 dispatcher remove/add 状态水位之间的竞态。 +- `BarrierEvent.scheduleBlockEvent` 对 add-table 直接调用 `spanController.AddNewTable(..., be.commitTs)`。 +- 当更旧的 add-table barrier 迟到时,`be.commitTs` 可能低于同 tableID 上一个 dispatcher 已经关闭并 flush 到的 checkpoint。 +- 旧 dispatcher 的 stopped status 带有安全水位 `467031809410466489`,但 remove operator 只更新已经脱离 spanController 管理的 `replicaSet`,没有把这个 table 级水位提供给后续 `AddNewTable` 使用。 +- 之前加在 `SpanReplication.NewAddDispatcherMessage` 里的 controller-level committed checkpoint 保护不能覆盖该窗口,因为全局 checkpoint 当时仍被其它 backlog 卡在更旧位置。 + +修复策略: + +1. 在 `maintainer/span.Controller` 中维护 `removedTableCheckpointTs map[int64]uint64`,记录每个 tableID 已移除 dispatcher 报告过的最高 checkpoint。 +2. `AddNewSpans` 创建新 dispatcher 前,用 `removedTableCheckpointTs[tableID]` 对 `startTs` 做下限保护。 +3. 
`removeSpanWithoutLock` 记录移除时已有的 status checkpoint,覆盖同步删除路径。 +4. `removeDispatcherOperator.Check` 收到 `Stopped/Removed` terminal status 时,调用 `RecordRemovedSpanCheckpoint` 把最终 checkpoint 写回 span controller。 +5. `MoveDispatcherOperator.Check` 在 origin stopped 时同步更新 `replicaSet` status,避免 move add-dest 阶段也从旧 checkpoint 创建 dispatcher。 + +已实施修复: + +- `maintainer/span/span_controller.go` + - 新增 table 级 removed checkpoint 记录。 + - `AddNewSpans` 使用 table 级 removed checkpoint clamp 新 dispatcher startTs。 + - 新增 `RecordRemovedSpanCheckpoint`。 +- `maintainer/operator/operator_remove.go` + - terminal status 到达后记录 table 级 removed checkpoint。 +- `maintainer/operator/operator_move.go` + - origin stopped 时更新 `replicaSet` status,保证 add-dest 消息使用 stopped checkpoint。 +- `maintainer/span/span_controller_test.go` + - 新增 `TestControllerAddNewTableClampsToRemovedTableCheckpoint`。 + - 新增 `TestControllerAddNewTableIgnoresLowerRemovedTableCheckpoint`。 +- `maintainer/operator/operator_move_test.go` + - 新增 `TestMoveOperatorUsesStoppedCheckpointWhenAddingDest`。 + +新增验证: + +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer/span ./maintainer/operator -run "TestControllerAddNewTable|TestMoveOperatorUsesStoppedCheckpointWhenAddingDest|TestRemoveOperator|TestMoveOperator_OriginNodeRemovedAfterOriginStopped" -count=1` + - 结果:通过。 +- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer/span ./maintainer/operator ./maintainer -count=1` + - 结果:通过。 + +后续验证计划: + +- 运行 `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"`。 +- 运行 `GOTOOLCHAIN=auto make integration_test_build_fast`。 +- 使用 `RUN_PROFILE=weekly RUN_DURATION=30m RUN_CONVERGE_TIMEOUT=120m` 重新跑 `weekly_rand_single`,并继续直到连续 5 次通过。 + +## 最新失败:log scan 对随机 DML payload 中 `panic`/`fatal` 子串误报 + +第四次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 下已经成功追上 finish mark: + +```text +2026/06/16 06:49:22.074533 converge done: finish mark applied downstream +``` + +这次失败发生在收敛之后的最终日志扫描: + +```text +2026/06/16 
06:52:04.566668 log scan: found 88 matches +2026/06/16 06:52:04.570322 runner failed: log scan found 88 panic/fatal/race matches +``` + +抽样命中内容: + +```text +Rows: Insert: Row: 4240, ..., Bb8bdTFTEIN9i3spwifGjZj3AmFAtalR, ... +Rows: Insert: Row: 60718, ..., 1YCs3x0WFrKYaheC3jpXpAnicxBqG3pe, ... +Rows: Insert: Row: 142906, ..., vXsdTVjMIJZa21NY95aFpiiPANicu51F, ... +``` + +根因判断: + +- `tests/utils/random_ddl_test_runner/logscan.go` 对配置里的 `panic_patterns` 做大小写无关 substring 匹配。 +- weekly random DML 会生成随机字符串列;这些 payload 可能自然包含 `panic` / `fatal` 的大小写变体。 +- 命中行都是 `[DEBUG]` DML event / SQL builder 日志里的 row value,不是真实 `[FATAL]`、`[PANIC]`、Go `panic:`、`fatal error:` 或 race detector 输出。 +- 因此这是 log scan 误报,不是 TiCDC 运行时 panic/fatal,也不是同步正确性错误。 + +修复策略: + +1. 对默认关键字 `panic`/`fatal` 做语义化匹配:只匹配真实日志等级或 Go runtime 前缀。 + - `fatal`: `[FATAL]`、`level=fatal`、行首 `fatal error:`。 + - `panic`: `[PANIC]`、`level=panic`、行首 `panic:`。 +2. `DATA RACE` 继续保留 substring 匹配,因为 race detector 的输出就是固定短语。 +3. 自定义 pattern 继续保持原 substring 行为,避免改变扩展配置语义。 + +已实施修复: + +- `tests/utils/random_ddl_test_runner/logscan.go` + - 新增 `logLineMatchesPattern`,特殊处理默认 `panic`/`fatal`。 + - 调整跨 buffer carry 长度,确保 `fatal error:` / `level=panic` 等特殊模式跨片段时仍可检测。 +- `tests/utils/random_ddl_test_runner/logscan_test.go` diff --git a/.issue/weekly_rand_single_notebook.md b/.issue/weekly_rand_single_notebook.md new file mode 100644 index 0000000000..1d53a090c4 --- /dev/null +++ b/.issue/weekly_rand_single_notebook.md @@ -0,0 +1,345 @@ +# weekly_rand_single 调查 Notebook + +本 notebook 记录本 case 调查过程中固定会出现、但通常不是代码根因的错误/噪音,以及下一次遇到时的处理方式。 + +## 固定环境噪音 + +### TiDB 启动检查早期 `ERROR 2003` + +现象: + +```text +Verifying Upstream TiDB is started... 
+ERROR 2003 (HY000): Can't connect to MySQL server on '127.0.0.1:4000' (111) +``` + +判断: + +- 这是启动检查刚开始时 TiDB 端口还没 ready 的 transient error。 +- 如果后面能打印 `mysql.tidb` 变量表,或者继续进入 CDC/changefeed/workload,就不要当成 case 失败。 + +处理: + +- 继续观察,不要因为这一行中断。 +- 只有连续重试后脚本明确 `start tidb cluster failed` 并退出,才作为环境失败处理。 + +### `tiflash: command not found` + +现象: + +```text +Starting Upstream TiFlash... +.../start_tidb_cluster_impl: line 365: tiflash: command not found +start tidb cluster failed +The 2 times to try to start tidb cluster... +``` + +判断: + +- 这是远端 PATH 没包含 TiFlash binary,不是 TiCDC 代码逻辑失败。 +- 远端已有 TiFlash binary: + - `/home/hongyunyan/.tiup/components/tiflash/v9.0.0-beta.2.pre-nightly/tiflash/tiflash` +- 当前这类错误可能出现在 cluster start retry 阶段;如果脚本后续进入 `bootstrap done`、创建 changefeed 并开始 workload,则无需处理。 + +处理: + +- 如果脚本最终继续进入 workload:记录为环境噪音,继续跑。 +- 如果脚本因为找不到 TiFlash 最终退出:补 PATH 后重跑: + +```bash +export PATH=/home/hongyunyan/.tiup/components/tiflash/v9.0.0-beta.2.pre-nightly/tiflash:$PATH +``` + +### `go test` 默认 toolchain 不满足要求 + +现象: + +```text +go.mod requires go >= 1.25.10 +``` + +判断: + +- 远端默认 go 版本可能低于 `go.mod` 要求,或者环境中 `GOTOOLCHAIN=local`。 + +处理: + +- Go 测试统一带: + +```bash +GOTOOLCHAIN=auto go test --tags=intest ./path -run TestName -count=1 +``` + +### 直接 `go test` 跑到 failpoint 代码 + +现象: + +```text +undefined: failpoint.Inject +undefined: failpoint.Return +``` + +判断: + +- 这是 failpoint 代码没有被 rewrite 的编译错误,不是目标 case 的业务失败。 +- 本仓库的 failpoint 相关测试需要走 make 目标,或者先启用 failpoint rewrite。 + +处理: + +- 优先用包级 make 目标: + +```bash +GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer" +``` + +- 如果必须直接 `go test`,先按仓库脚本启用 failpoint,结束后再 disable,避免污染后续测试。 + +### 长时间 5 连跑输出过大 + +现象: + +- `weekly_rand_single` workload 每秒输出大量 DDL 行。 +- 5 连跑如果直接 `2>&1 | tee -a log`,终端输出会非常大,远端 ssh 会话可能被输出拖慢。 + +判断: + +- case 结果以 `/tmp/tidb_cdc_test/weekly_rand_single_5pass.log` 里的 pass/fail 标记和脚本退出码为准。 +- 终端不需要实时接收完整 DDL 流,只需要日志文件完整保留。 + +处理: + +- 下次启动 5 连跑时,直接把 stdout/stderr 
写日志,不要 tee 到终端: + +```bash +GOTOOLCHAIN=auto RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=${seed} \ + tests/integration_tests/run.sh mysql weekly_rand_single \ + >> /tmp/tidb_cdc_test/weekly_rand_single_5pass.log 2>&1 +``` + +- 如果已经启动成 `tee -a`,可以只重定向父 shell 或当前 `tee` 的 fd 1 到 `/dev/null`,不影响日志文件继续写入: + +```bash +gdb -q -p "${TEE_PID}" -batch \ + -ex 'p (int) close(1)' \ + -ex 'p (int) open("/dev/null", 1)' +``` + +- 重定向之后继续用以下命令观察: + +```bash +grep -n "weekly_rand_single passed\|weekly_rand_single failed\|five consecutive" \ + /tmp/tidb_cdc_test/weekly_rand_single_5pass.log +grep -a "health:" /tmp/tidb_cdc_test/weekly_rand_single_5pass.log | tail -n 8 +``` + +### workload 结束瞬间的 `context deadline exceeded` + +现象: + +```text +ddl worker=3 kind=add_index ... err=context deadline exceeded +ddl worker=1 kind=split_add_index ... err=context deadline exceeded +``` + +判断: + +- 如果这些行出现在 `workload finished, waiting for converge` 前后,通常只是 workload 总时长到期,DDL worker 被 context 取消。 +- 这类行本身不是 case 失败;真正失败要看后续是否出现 `runner failed:`、checksum diff、panic/fatal/race,或脚本退出码非 0。 + +处理: + +- 不要只因为 DDL worker 的 `context deadline exceeded` 判定代码错误。 +- 继续看后面的 converge、finish mark、diff 和 log scan。 + +### weekly profile 的 `converge_timeout=30m` 过短 + +现象: + +```text +workload finished, waiting for converge: 20s +converge: waiting for finish mark, checkpoint=... 
+runner failed: context deadline exceeded +``` + +本次 seed `2026061509` 的证据: + +- workload 在 `2026-06-16 02:55:45` 结束并进入 converge。 +- 上游 finish mark DDL commit 时间为 `2026-06-16 10:56:05.681`。 +- 30 分钟 converge deadline 到期前,checkpoint 只到 `2026-06-16 10:38:29.999`。 +- checkpoint 持续前进,不是完全卡死;但还差约 `17m36s` 逻辑时间,按当时速率需要额外约 `56-58m`。 + +判断: + +- 如果上游 `db1.finish_mark` 已存在、下游还没有,且 CDC status 的 checkpoint 仍在推进,这更像 backlog 收敛窗口不足,不要立即当成 barrier 卡死。 +- 如果 checkpoint 连续超过 `no_advance_hard` 没前进,或者 CDC state 变成 failed/error,再按 CDC 正确性问题调查。 + +处理: + +- weekly profile 运行时使用更长收敛窗口: + +```bash +export RUN_CONVERGE_TIMEOUT=120m +``` + +- 修改后的 weekly random DDL run.sh 会在 `RUN_PROFILE=weekly` 且未显式指定时默认使用 `120m`;smoke 仍默认 `30m`。 +- 观察命令: + +```bash +curl -s http://127.0.0.1:8300/api/v2/changefeeds/weeklyrand | tr ',' '\n' | grep -E 'state|checkpoint_ts|checkpoint_time|resolved_ts' +mysql -uroot -h127.0.0.1 -P4000 -Nse "SELECT COUNT(*), IFNULL(MAX(v),0) FROM db1.finish_mark;" +mysql -uroot -h127.0.0.1 -P3306 -Nse "SHOW TABLES FROM db1 LIKE 'finish_mark'; SELECT COUNT(*), IFNULL(MAX(v),0) FROM db1.finish_mark;" 2>&1 || true +``` + +### 随机 DDL 的 TiDB 业务错误 + +现象: + +```text +err=Error 1071 (42000): Specified key was too long +err=Error 8200 (HY000): Unsupported modify charset from utf8mb4 to gbk +err=Error 1292 (22007): Truncated incorrect DOUBLE value +err=Error 1265 (01000): Data truncated for column +err=Error 1146 (42S02): Table ... doesn't exist +err=Error 1054 (42S22): Unknown column ... 
+``` + +判断: + +- 这些错误来自 random DDL runner 故意尝试高风险 DDL:加索引、改字符集、改列类型、drop/recover/rename 竞态等。 +- DDL worker 会记录错误并继续;只要 runner 没有最终 `runner failed:`,这些单条业务错误不是 case 失败。 +- 需要区分两类 `1146`: + - workload 中目标表被并发 drop/rename 后报 `1146`,通常是预期噪音; + - converge 阶段查询下游 `db1.finish_mark` 报 `1146`,表示下游还没追到 finish mark,需要结合 checkpoint 判断。 +- 需要区分两类 `1054`: + - workload 中 random DDL worker 对已变化的列执行 `modify_column_type` 等操作后报 `Unknown column`,通常是预期噪音; + - TiCDC sink 重试 DML 时在 `cdc.log` 出现 `ErrReachMaxTry`/`Unknown column`,或最终 `runner failed:` 关联到该错误,才是需要调查的同步错误。 + +处理: + +- 不要因为单条 DDL business error 停测试。 +- 真正需要处理的是: + - `runner failed:` 后的最终错误; + - sync diff 不一致; + - panic/fatal/data race; + - changefeed state failed/error; + - checkpoint 在 `no_advance_hard` 窗口内完全不推进。 + +快速过滤: + +```bash +grep -aE "runner failed|weekly_rand_single failed|checksum|panic|fatal|DATA RACE|state=failed|state=error" \ + /tmp/tidb_cdc_test/weekly_rand_single_5pass.log +``` + +### `RECOVER TABLE` 后下游 `Unknown column` + +现象: + +```text +changefeed state is not normal: warning +Error 1054 (42S22): Unknown column 'a' in 'field list' +REPLACE INTO `db1`.`t15_r_3235459` (`id`,`b`,`c`,`d`,`e`,`bin`,`a`) VALUES (...) 
+``` + +本次 seed `2026061509` 的证据: + +- 上游在 `03:42:43` 对 `db1.t15_r_3235459` 执行过 `DROP COLUMN a`,随后在 `03:42:44` drop table。 +- 上游在 `03:43:48` 执行 `RECOVER TABLE db1.t15_r_3235459`,TiCDC 事件中的 recovered `TableInfo` 带有列 `a`。 +- MySQL sink 在下游直接执行原始 `RECOVER TABLE db1.t15_r_3235459` 成功。 +- 后续 DML 按上游 recovered `TableInfo` 生成,SQL 包含 `a`;但下游实际恢复出来的表不带 `a`,因此报 `Unknown column 'a'` 并进入 warning。 + +判断: + +- 这不是 converge timeout;延长时间不会恢复。 +- 这也不是普通 random DDL business error;changefeed 已进入 warning,DML 会反复失败直到 runner 失败。 +- 根因是 `RECOVER TABLE` 依赖执行集群本地 DDL history / recycle-bin / GC snapshot 状态。TiCDC 内部 schema timeline 来自上游,sink 执行的 raw SQL 却让下游自己选择历史表;同名表多次 drop/recover/drop-column 后,上下游可能恢复到不同 schema。 +- `RECOVER TABLE BY JOB JOB_ID` 不能直接用上游 drop job id 修复,因为下游执行时查的是下游自己的 DDL job id;TiCDC 当前没有维护上游 drop job 到下游 drop job 的映射。 +- 把 recover 改成 `CREATE TABLE` 也不是完整修复,因为会丢失 `RECOVER TABLE` 应恢复的旧数据。 + +处理: + +- weekly random DDL 默认集合不要生成 `recover_table`。 +- 当前修复是在 `tests/utils/random_ddl_test_runner/defaultDDLKinds()` 中移除 `recover_table`,保留 `genRecoverTable` 供显式测试。 +- 遇到类似日志时,先确认 random runner 是否又启用了 `recover_table`: + +```bash +grep -RIn "name: *\"recover_table\"" tests/utils/random_ddl_test_runner +grep -a "kind=recover_table" /tmp/tidb_cdc_test/weekly_rand_single/ddl_trace.log | tail -n 20 +grep -aE "Unknown column|state=warning|ErrReachMaxTry" /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 80 +``` + +- 如果未来要正式支持 CDC 复制 `RECOVER TABLE`,需要单独设计:例如维护下游 drop/truncate job id 映射并处理路由/重试/GC,或在 recover 后做完整数据重建/快照补偿。不要把这个产品级语义问题混进 weekly random case 修复。 + +### sink-side `ErrReachMaxTry Unknown column` after dispatcher recreate + +现象: + +```text +runner failed: changefeed state is not normal: warning +[CDC:ErrReachMaxTry] ... REPLACE INTO `db2`.`t14_r_3402273` (`id`,`a`,`b`,`c`,`d`,`e`,`bin`) ... 
+Error 1054 (42S22): Unknown column 'bin' in 'field list' +Error 1054 (42S22): Unknown column 'e' in 'field list' +``` + +判断: + +- 这不是 random DDL worker 的普通 `Unknown column` 业务噪音。 +- 只要错误出现在 TiCDC sink retry / `ErrReachMaxTry` / changefeed warning 路径,就按同步正确性问题调查。 +- 本次固定模式是:旧 dispatcher 已经执行过 table DDL 并 stopped 到更高 checkpoint;随后迟到的 add-table barrier 又从更旧 `startTs` 创建新 dispatcher,重放 DDL 之前的 DML,打到 DDL 之后的下游 schema。 +- 延长 `converge_timeout` 不能修复这类问题;延长只解决 backlog 仍在正常推进的 timeout。 + +快速定位: + +```bash +grep -aE "ErrReachMaxTry|Unknown column|changefeed state is not normal" \ + /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 120 + +grep -aE "new span replication created|add new table|dispatcher component has stopped|send reset dispatcher request|reset dispatcher" \ + /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 240 + +grep -a "${TABLE_NAME}" /tmp/tidb_cdc_test/weekly_rand_single/ddl_trace.log | tail -n 80 +``` + +处理: + +- 查旧 dispatcher stopped checkpoint 是否大于新 dispatcher startTs。 +- 查新 dispatcher replay 的第一条 DML commitTs 是否小于已经执行过的 DDL commitTs。 +- 如果满足上述条件,优先检查 `maintainer/span.Controller` 的 removed table checkpoint 是否记录并 clamp 了 `AddNewTable` 起点。 +- 当前修复点:`RecordRemovedSpanCheckpoint` + `AddNewSpans` table 级 startTs clamp;move 路径还需要确保 origin stopped status 写回 `replicaSet`。 + +### log scan `panic`/`fatal` false positives in random payload + +现象: + +```text +converge done: finish mark applied downstream +runner failed: log scan found 88 panic/fatal/race matches +``` + +判断: + +- 如果 log scan 失败发生在 `converge done` 之后,先不要按 TiCDC runtime panic 处理。 +- 抽样查看 `log scan match` 对应文件/行: + +```bash +grep -aE "log scan match|runner failed|converge done" \ + /tmp/tidb_cdc_test/weekly_rand_single_5pass.log | tail -n 120 + +sed -n "${START_LINE},${END_LINE}p" \ + /tmp/tidb_cdc_test/weekly_rand_single/${LOG_FILE} +``` + +- 如果命中行是 `[DEBUG]` DML event / SQL builder 日志,并且 `panic`/`fatal` 只出现在随机字符串列值里,例如: + +```text +Bb8bdTFTEIN9i3spwifGjZj3AmFAtalR +1YCs3x0WFrKYaheC3jpXpAnicxBqG3pe +``` + +则这是 log scan 误报,不是实际 
panic/fatal。 + +处理: + +- 默认 `panic`/`fatal` 不能用裸 substring 扫描随机 DML payload。 +- 当前修复在 `tests/utils/random_ddl_test_runner/logscan.go` 中把默认关键字限定为真实严重日志模式: + - `fatal`: `[FATAL]`、`level=fatal`、行首 `fatal error:`。 diff --git a/Makefile b/Makefile index 2c4cc78985..5ce75f561c 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ cdc kafka_consumer storage_consumer pulsar_consumer filter_helper \ prepare_test_binaries \ unit_test_in_verify_ci integration_test_build integration_test_build_fast integration_test_mysql integration_test_kafka integration_test_storage integration_test_pulsar \ + integration_test_weekly_rand_ddl_mysql \ generate-next-gen-grafana check-next-gen-grafana @@ -258,6 +259,9 @@ integration_test_storage: check_third_party_binary integration_test_pulsar: check_third_party_binary tests/integration_tests/run.sh pulsar "$(CASE)" "$(START_AT)" +integration_test_weekly_rand_ddl_mysql: check_third_party_binary + tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh mysql + unit_test: check_failpoint_ctl generate-protobuf mkdir -p "$(TEST_DIR)" $(FAILPOINT_ENABLE) diff --git a/downstreamadapter/dispatcher/basic_dispatcher.go b/downstreamadapter/dispatcher/basic_dispatcher.go index 72281639c6..4e210d7a93 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher.go +++ b/downstreamadapter/dispatcher/basic_dispatcher.go @@ -284,6 +284,14 @@ func (d *BasicDispatcher) AddDMLEventsToSink(events []*commonEvent.DMLEvent, wak // be rewritten into deletes when enable-active-active is disabled). 
filteredEvents := make([]*commonEvent.DMLEvent, 0, len(events)) for _, event := range events { + if d.blockEventStatus.isDMLCompletedOrObsolete(event.GetCommitTs()) { + log.Info("skip obsolete dml event", + zap.Stringer("dispatcher", d.id), + zap.Uint64("commitTs", event.GetCommitTs()), + zap.Uint64("seq", event.GetSeq())) + continue + } + // FilterDMLEvent returns the original event for normal tables and only // allocates a new event when the table needs active-active or soft-delete // processing. Skip is true when every row in the event is dropped, or when @@ -902,6 +910,10 @@ func (d *BasicDispatcher) reportBlockedEventDone( actionCommitTs uint64, actionIsSyncPoint bool, ) { + d.blockEventStatus.recordCompleted(BlockEventIdentifier{ + CommitTs: actionCommitTs, + IsSyncPoint: actionIsSyncPoint, + }) d.offerDoneBlockStatus(actionCommitTs, actionIsSyncPoint) GetDispatcherStatusDynamicStream().Wake(d.id) } @@ -985,7 +997,9 @@ func (d *BasicDispatcher) DealWithBlockEvent(event commonEvent.BlockEvent) { shouldBlock := d.shouldBlock(event) shouldHoldBlocked := d.shouldHoldBlockEvent(event) if shouldBlock && shouldHoldBlocked { - d.holdBlockEvent(event) + if !d.completeObsoleteBlockEvent(event) { + d.holdBlockEvent(event) + } return } // Writing a block event may involve downstream IO (e.g. 
executing DDL), so it must not block @@ -1013,6 +1027,9 @@ func (d *BasicDispatcher) DealWithBlockEvent(event commonEvent.BlockEvent) { } if shouldBlock { failpoint.Inject("BlockAfterFlush", nil) + if d.completeObsoleteBlockEvent(event) { + return + } d.reportBlockedEventToMaintainer(event) return } @@ -1195,6 +1212,20 @@ func (d *BasicDispatcher) reportBlockedEventToMaintainer(event commonEvent.Block d.offerBlockStatus(status) } +func (d *BasicDispatcher) completeObsoleteBlockEvent(event commonEvent.BlockEvent) bool { + if !d.blockEventStatus.isCompletedOrObsolete(event) { + return false + } + identifier := blockEventIdentifier(event) + log.Info("skip obsolete block event", + zap.Stringer("dispatcher", d.id), + zap.Uint64("commitTs", identifier.CommitTs), + zap.Bool("isSyncPoint", identifier.IsSyncPoint)) + d.PassBlockEventToSink(event) + d.reportBlockedEventDone(identifier.CommitTs, identifier.IsSyncPoint) + return true +} + func (d *BasicDispatcher) flushBlockedEventAndReportToMaintainer(event commonEvent.BlockEvent) { d.sharedInfo.GetBlockEventExecutor().Submit(d, func() { failpoint.Inject("BlockOrWaitBeforeFlush", nil) @@ -1203,6 +1234,9 @@ func (d *BasicDispatcher) flushBlockedEventAndReportToMaintainer(event commonEve return } failpoint.Inject("BlockAfterFlush", nil) + if d.completeObsoleteBlockEvent(event) { + return + } d.reportBlockedEventToMaintainer(event) }) } diff --git a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go index 38d112571c..ca6f581b63 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go @@ -13,7 +13,9 @@ package dispatcher import ( + "context" "testing" + "time" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/pkg/common" @@ -126,6 +128,89 @@ func TestDDLEventsAlwaysValidateActiveActive(t *testing.T) { } } +func 
TestHandleEventsSkipsDMLBeforeCompletedBlockEvent(t *testing.T) { + sharedInfo := newTestSharedInfo(false, false, nil) + dispatcherSink := newDispatcherTestSink(t, common.MysqlSinkType) + tableSpan := &heartbeatpb.TableSpan{TableID: 1, StartKey: []byte{0}, EndKey: []byte{1}} + dispatcher := NewBasicDispatcher( + common.NewDispatcherID(), + tableSpan, + 100, + 1, + NewSchemaIDToDispatchers(), + false, + false, + 4096, + 0, + 200, + common.DefaultMode, + dispatcherSink.Sink(), + sharedInfo, + ) + + helper := commonEvent.NewEventTestHelper(t) + defer helper.Close() + helper.Tk().MustExec("use test") + helper.DDL2Event("create table t (id int primary key, v int)") + oldDML := helper.DML2Event("test", "t", "insert into t values (1, 1)") + oldDML.DispatcherID = dispatcher.id + oldDML.StartTs = 110 + oldDML.CommitTs = 120 + newDML := helper.DML2Event("test", "t", "insert into t values (2, 2)") + newDML.DispatcherID = dispatcher.id + newDML.StartTs = 130 + newDML.CommitTs = 140 + + dispatcher.blockEventStatus.recordCompleted(BlockEventIdentifier{CommitTs: 120}) + block := dispatcher.handleEvents([]DispatcherEvent{{Event: oldDML}, {Event: newDML}}, func() {}) + require.True(t, block) + + dmls := dispatcherSink.GetDMLs() + require.Len(t, dmls, 1) + require.Equal(t, uint64(140), dmls[0].CommitTs) +} + +func TestHeldObsoleteBlockEventCompletesWithoutWaitingReport(t *testing.T) { + sharedInfo := newTestSharedInfo(false, false, nil) + dispatcherSink := newDispatcherTestSink(t, common.MysqlSinkType) + dispatcherID := common.NewDispatcherID() + dispatcher := NewBasicDispatcher( + dispatcherID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), + 100, + common.DDLSpanSchemaID, + NewSchemaIDToDispatchers(), + false, + false, + 4096, + 0, + 200, + common.DefaultMode, + dispatcherSink.Sink(), + sharedInfo, + ) + + event := commonEvent.NewSyncPointEvent(dispatcherID, 120, 1, 0) + dispatcher.pendingACKCount.Store(1) + dispatcher.DealWithBlockEvent(event) + require.NotNil(t, 
dispatcher.holdingBlockEvent) + require.Equal(t, 0, dispatcher.resendTaskMap.Len()) + + dispatcher.blockEventStatus.recordCompleted(BlockEventIdentifier{CommitTs: 120, IsSyncPoint: true}) + dispatcher.pendingACKCount.Store(0) + dispatcher.tryDealWithHeldBlockEvent() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + status := dispatcher.TakeBlockStatus(ctx) + require.NotNil(t, status) + require.Equal(t, heartbeatpb.BlockStage_DONE, status.State.Stage) + require.Equal(t, uint64(120), status.State.BlockTs) + require.True(t, status.State.IsSyncPoint) + require.Equal(t, 0, dispatcher.resendTaskMap.Len()) + require.Nil(t, dispatcher.blockEventStatus.getEvent()) +} + func newTestBasicDispatcher(t *testing.T, sinkType common.SinkType, enableActiveActive bool) *BasicDispatcher { t.Helper() sharedInfo := newTestSharedInfo(enableActiveActive, false, nil) diff --git a/downstreamadapter/dispatcher/helper.go b/downstreamadapter/dispatcher/helper.go index c76b333832..580464df6c 100644 --- a/downstreamadapter/dispatcher/helper.go +++ b/downstreamadapter/dispatcher/helper.go @@ -86,6 +86,8 @@ type BlockEventStatus struct { blockPendingEvent commonEvent.BlockEvent blockStage heartbeatpb.BlockStage blockCommitTs uint64 + completed BlockEventIdentifier + hasCompleted bool } func (b *BlockEventStatus) clear() { @@ -106,6 +108,33 @@ func (b *BlockEventStatus) setBlockEvent(event commonEvent.BlockEvent, blockStag b.blockCommitTs = event.GetCommitTs() } +func (b *BlockEventStatus) isCompletedOrObsolete(event commonEvent.BlockEvent) bool { + b.mutex.Lock() + defer b.mutex.Unlock() + + if !b.hasCompleted { + return false + } + return compareBlockEventIdentifier(blockEventIdentifier(event), b.completed) <= 0 +} + +func (b *BlockEventStatus) isDMLCompletedOrObsolete(commitTs uint64) bool { + b.mutex.Lock() + defer b.mutex.Unlock() + + return b.hasCompleted && commitTs <= b.completed.CommitTs +} + +func (b *BlockEventStatus) 
recordCompleted(identifier BlockEventIdentifier) { + b.mutex.Lock() + defer b.mutex.Unlock() + + if !b.hasCompleted || compareBlockEventIdentifier(identifier, b.completed) > 0 { + b.completed = identifier + b.hasCompleted = true + } +} + func (b *BlockEventStatus) updateBlockStage(blockStage heartbeatpb.BlockStage) { b.mutex.Lock() defer b.mutex.Unlock() @@ -139,7 +168,8 @@ func (b *BlockEventStatus) actionMatchs(action *heartbeatpb.DispatcherAction) bo return false } - return b.blockCommitTs == action.CommitTs + pendingIsSyncPoint := b.blockPendingEvent.GetType() == commonEvent.TypeSyncPointEvent + return b.blockCommitTs == action.CommitTs && pendingIsSyncPoint == action.IsSyncPoint } // ignoredStatusMatches checks whether the ignored status is for the current pending ddl/sync point event. @@ -169,6 +199,29 @@ func (b *BlockEventStatus) getEventCommitTs() (uint64, bool) { return b.blockCommitTs, true } +func blockEventIdentifier(event commonEvent.BlockEvent) BlockEventIdentifier { + return BlockEventIdentifier{ + CommitTs: event.GetCommitTs(), + IsSyncPoint: event.GetType() == commonEvent.TypeSyncPointEvent, + } +} + +func compareBlockEventIdentifier(a, b BlockEventIdentifier) int { + if a.CommitTs < b.CommitTs { + return -1 + } + if a.CommitTs > b.CommitTs { + return 1 + } + if a.IsSyncPoint == b.IsSyncPoint { + return 0 + } + if !a.IsSyncPoint && b.IsSyncPoint { + return -1 + } + return 1 +} + type SchemaIDToDispatchers struct { mutex sync.RWMutex m map[int64]map[common.DispatcherID]interface{} diff --git a/downstreamadapter/dispatcher/helper_test.go b/downstreamadapter/dispatcher/helper_test.go new file mode 100644 index 0000000000..e811a41e19 --- /dev/null +++ b/downstreamadapter/dispatcher/helper_test.go @@ -0,0 +1,48 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package dispatcher + +import ( + "testing" + + "github.com/pingcap/ticdc/heartbeatpb" + "github.com/pingcap/ticdc/pkg/common" + commonEvent "github.com/pingcap/ticdc/pkg/common/event" + "github.com/stretchr/testify/require" +) + +func TestBlockEventStatusCompletedWatermark(t *testing.T) { + var status BlockEventStatus + ddl10 := &commonEvent.DDLEvent{FinishedTs: 10} + syncpoint10 := commonEvent.NewSyncPointEvent(common.NewDispatcherID(), 10, 1, 0) + ddl11 := &commonEvent.DDLEvent{FinishedTs: 11} + + status.recordCompleted(BlockEventIdentifier{CommitTs: 10, IsSyncPoint: false}) + require.True(t, status.isCompletedOrObsolete(ddl10)) + require.False(t, status.isCompletedOrObsolete(syncpoint10)) + require.False(t, status.isCompletedOrObsolete(ddl11)) + + status.recordCompleted(BlockEventIdentifier{CommitTs: 10, IsSyncPoint: true}) + require.True(t, status.isCompletedOrObsolete(ddl10)) + require.True(t, status.isCompletedOrObsolete(syncpoint10)) + require.False(t, status.isCompletedOrObsolete(ddl11)) +} + +func TestBlockEventStatusActionMatchesSyncPointFlag(t *testing.T) { + var status BlockEventStatus + status.setBlockEvent(&commonEvent.DDLEvent{FinishedTs: 10}, heartbeatpb.BlockStage_WAITING) + + require.True(t, status.actionMatchs(&heartbeatpb.DispatcherAction{CommitTs: 10})) + require.False(t, status.actionMatchs(&heartbeatpb.DispatcherAction{CommitTs: 10, IsSyncPoint: true})) +} diff --git a/downstreamadapter/eventcollector/dispatcher_stat.go b/downstreamadapter/eventcollector/dispatcher_stat.go index 0153b3dc16..0b5b1164dc 100644 --- a/downstreamadapter/eventcollector/dispatcher_stat.go +++ 
b/downstreamadapter/eventcollector/dispatcher_stat.go @@ -137,6 +137,12 @@ func (d *dispatcherStat) advanceEpochForReset(resetTs uint64) uint64 { currentState := d.loadCurrentEpochState() nextState := newDispatcherEpochState(currentState.epoch+1, 0, resetTs) if d.currentEpoch.CompareAndSwap(currentState, nextState) { + // The new epoch replays events from resetTs. Commit-ts based + // deduplication from the old epoch must not filter replayed DDL or + // SyncPoint events. + d.lastEventCommitTs.Store(resetTs) + d.gotDDLOnTs.Store(false) + d.gotSyncpointOnTS.Store(false) return nextState.epoch } } diff --git a/downstreamadapter/eventcollector/dispatcher_stat_test.go b/downstreamadapter/eventcollector/dispatcher_stat_test.go index 9c3088a6eb..05bdcbf243 100644 --- a/downstreamadapter/eventcollector/dispatcher_stat_test.go +++ b/downstreamadapter/eventcollector/dispatcher_stat_test.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/node" + "github.com/pingcap/ticdc/utils/dynstream" "github.com/pingcap/tidb/pkg/util/chunk" "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/oracle" @@ -552,6 +553,53 @@ func TestUpdateCommitTsStateByEvents(t *testing.T) { require.Equal(t, uint64(110), state.maxEventTs.Load()) } +func TestAdvanceEpochForResetClearsCommitTsFilter(t *testing.T) { + t.Parallel() + + dispatcherID := common.NewDispatcherID() + eventServiceID := node.ID("event-service-1") + mockDisp := newMockDispatcher(dispatcherID, 100) + mockDisp.handleEvents = func(events []dispatcher.DispatcherEvent, wakeCallback func()) (block bool) { + return len(events) > 0 + } + + stat := newDispatcherStatForTest(mockDisp, nil) + stat.currentEpoch.Store(newDispatcherEpochState(10, 3, stat.target.GetStartTs())) + stat.lastEventCommitTs.Store(220) + stat.gotDDLOnTs.Store(true) + stat.gotSyncpointOnTS.Store(true) + + epoch := stat.advanceEpochForReset(150) + require.Equal(t, uint64(11), 
epoch) + require.Equal(t, uint64(150), stat.lastEventCommitTs.Load()) + require.False(t, stat.gotDDLOnTs.Load()) + require.False(t, stat.gotSyncpointOnTS.Load()) + + handshake := commonEvent.NewHandshakeEvent(dispatcherID, 160, epoch, &common.TableInfo{}) + stat.handleHandshakeEvent(dispatcher.DispatcherEvent{ + From: &eventServiceID, + Event: &handshake, + }) + + ddl := &commonEvent.DDLEvent{ + Version: commonEvent.DDLEventVersion1, + FinishedTs: 180, + Seq: 2, + Epoch: epoch, + } + require.True(t, stat.handleSingleDataEvents([]dispatcher.DispatcherEvent{ + { + From: &eventServiceID, + Event: ddl, + }, + })) + require.Len(t, mockDisp.events, 1) + require.Same(t, ddl, mockDisp.events[0].Event) + require.Equal(t, uint64(180), stat.lastEventCommitTs.Load()) + require.True(t, stat.gotDDLOnTs.Load()) + require.False(t, stat.gotSyncpointOnTS.Load()) +} + func TestHandleSignalEvent(t *testing.T) { localServerID := node.ID("local-server") remoteServerID := node.ID("remote-server") @@ -897,6 +945,42 @@ func TestInitialLocalReadyCallbackIsOneShot(t *testing.T) { requireNoDispatcherRequest(t, mockEventCollector) } +func TestReleasePathFeedbackResetsCurrentEventService(t *testing.T) { + localServerID := node.ID("local-server") + dispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("release_path_test", common.DefaultKeyspaceName) + mockDisp := newMockDispatcher(dispatcherID, 10) + mockDisp.changefeedID = cfID + mockDisp.checkPointTs = 20 + mockEventCollector := newTestEventCollector(localServerID) + stat := newDispatcherStat(mockDisp, mockEventCollector, nil) + setSessionState(stat.session, localServerID, false, "") + mockEventCollector.dispatcherMap.Store(dispatcherID, stat) + mockEventCollector.changefeedMap.Store(cfID.ID(), newChangefeedStat(cfID)) + + released := false + feedback := dynstream.Feedback[common.GID, common.DispatcherID, *dispatcherStat]{ + Area: cfID.ID(), + Path: dispatcherID, + FeedbackType: dynstream.ReleasePath, + } + 
mockEventCollector.handleReleasePathFeedback(feedback, func(path common.DispatcherID) { + released = true + require.Equal(t, dispatcherID, path) + }, "DS") + + require.True(t, released) + cfStatValue, ok := mockEventCollector.changefeedMap.Load(cfID.ID()) + require.True(t, ok) + require.Equal(t, uint32(1), cfStatValue.(*changefeedStat).memoryReleaseCount.Load()) + requireDispatcherRequests( + t, + readDispatcherRequests(t, mockEventCollector, 1), + dispatcherRequestRecord{to: localServerID, action: eventpb.ActionType_ACTION_TYPE_RESET}, + ) + requireNoDispatcherRequest(t, mockEventCollector) +} + func TestIsFromCurrentEpoch(t *testing.T) { t.Parallel() diff --git a/downstreamadapter/eventcollector/event_collector.go b/downstreamadapter/eventcollector/event_collector.go index 6456f1ada2..7319279a54 100644 --- a/downstreamadapter/eventcollector/event_collector.go +++ b/downstreamadapter/eventcollector/event_collector.go @@ -448,24 +448,38 @@ func (c *EventCollector) processDSFeedback(ctx context.Context) error { return context.Cause(ctx) case feedback := <-c.ds.Feedback(): if feedback.FeedbackType == dynstream.ReleasePath { - if v, ok := c.changefeedMap.Load(feedback.Area); ok { - v.(*changefeedStat).memoryReleaseCount.Add(1) - } - log.Info("release dispatcher memory in DS", zap.Any("dispatcherID", feedback.Path)) - c.ds.Release(feedback.Path) + c.handleReleasePathFeedback(feedback, c.ds.Release, "DS") } case feedback := <-c.redoDs.Feedback(): if feedback.FeedbackType == dynstream.ReleasePath { - if v, ok := c.changefeedMap.Load(feedback.Area); ok { - v.(*changefeedStat).memoryReleaseCount.Add(1) - } - log.Info("release dispatcher memory in redo DS", zap.Any("dispatcherID", feedback.Path)) - c.redoDs.Release(feedback.Path) + c.handleReleasePathFeedback(feedback, c.redoDs.Release, "redo DS") } } } } +func (c *EventCollector) handleReleasePathFeedback( + feedback dynstream.Feedback[common.GID, common.DispatcherID, *dispatcherStat], + release func(common.DispatcherID), 
+ streamName string, +) { + if v, ok := c.changefeedMap.Load(feedback.Area); ok { + v.(*changefeedStat).memoryReleaseCount.Add(1) + } + log.Info("release dispatcher memory in "+streamName, zap.Any("dispatcherID", feedback.Path)) + release(feedback.Path) + + stat := c.getDispatcherStatByID(feedback.Path) + if stat == nil { + return + } + log.Info("reset dispatcher after releasing queued events", + zap.Stringer("changefeedID", stat.target.GetChangefeedID()), + zap.Stringer("dispatcherID", feedback.Path), + zap.String("stream", streamName)) + stat.session.resetCurrentEventService() +} + func (c *EventCollector) sendDispatcherRequests(ctx context.Context) error { for { select { diff --git a/maintainer/barrier.go b/maintainer/barrier.go index 75e5cc2482..0ee93f7de8 100644 --- a/maintainer/barrier.go +++ b/maintainer/barrier.go @@ -19,6 +19,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/maintainer/operator" + "github.com/pingcap/ticdc/maintainer/replica" "github.com/pingcap/ticdc/maintainer/span" "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/messaging" @@ -252,7 +253,7 @@ func (b *Barrier) Resend() []*messaging.TargetMessage { eventList := make([]*BarrierEvent, 0) b.blockedEvents.Range(func(key eventKey, barrierEvent *BarrierEvent) bool { // todo: we can limit the number of messages to send in one round here - msgs = append(msgs, barrierEvent.resend(b.mode)...) + msgs = append(msgs, barrierEvent.resendWithSchedule(b.mode, b.tryScheduleEvent)...) 
eventList = append(eventList, barrierEvent) return true @@ -308,7 +309,7 @@ func (b *Barrier) handleOneStatus(changefeedID *heartbeatpb.ChangefeedID, status Mode: status.Mode, }) if status.State != nil { - span.UpdateBlockState(*status.State) + updateSpanBlockState(span, status.State) } } if status.State.Stage == heartbeatpb.BlockStage_DONE { @@ -317,6 +318,38 @@ func (b *Barrier) handleOneStatus(changefeedID *heartbeatpb.ChangefeedID, status return b.handleBlockState(cfID, dispatcherID, status) } +func updateSpanBlockState(span *replica.SpanReplication, newState *heartbeatpb.State) { + oldState := span.GetBlockState() + if oldState != nil && compareBlockState(oldState, newState) > 0 { + log.Debug("ignore stale block state", + zap.String("dispatcher", span.ID.String()), + zap.Uint64("oldBlockTs", oldState.BlockTs), + zap.Bool("oldIsSyncPoint", oldState.IsSyncPoint), + zap.String("oldStage", oldState.Stage.String()), + zap.Uint64("newBlockTs", newState.BlockTs), + zap.Bool("newIsSyncPoint", newState.IsSyncPoint), + zap.String("newStage", newState.Stage.String())) + return + } + span.UpdateBlockState(*newState) +} + +func compareBlockState(a, b *heartbeatpb.State) int { + if a.BlockTs < b.BlockTs { + return -1 + } + if a.BlockTs > b.BlockTs { + return 1 + } + if a.IsSyncPoint != b.IsSyncPoint { + if !a.IsSyncPoint && b.IsSyncPoint { + return -1 + } + return 1 + } + return int(a.Stage) - int(b.Stage) +} + func (b *Barrier) handleEventDone(changefeedID common.ChangeFeedID, dispatcherID common.DispatcherID, status *heartbeatpb.TableSpanBlockStatus) *BarrierEvent { key := getEventKey(status.State.BlockTs, status.State.IsSyncPoint) event, ok := b.blockedEvents.Get(key) diff --git a/maintainer/barrier_event.go b/maintainer/barrier_event.go index 431aeac039..92031a504a 100644 --- a/maintainer/barrier_event.go +++ b/maintainer/barrier_event.go @@ -224,11 +224,8 @@ func (be *BarrierEvent) onAllDispatcherReportedBlockEvent(dispatcherID common.Di } // Once the event enters 
selected state, we start a new reporting phase that - // tracks completion after write/pass rather than the initial WAITING - // coverage. Reset both structures so DONE reports are measured from scratch. - be.rangeChecker.Reset() - be.reportedDispatchers = make(map[common.DispatcherID]struct{}) - + // tracks completion after write/pass rather than the initial WAITING coverage. + be.resetProgressAfterSelection() be.selected.Store(true) be.writerDispatcher = dispatcher be.lastResendTime = time.Now() @@ -325,6 +322,152 @@ func (be *BarrierEvent) addDispatchersToRangeChecker() { } } +func (be *BarrierEvent) ensureRangeChecker() { + if be.rangeChecker != nil || be.blockedDispatchers == nil { + return + } + + switch be.blockedDispatchers.InfluenceType { + case heartbeatpb.InfluenceType_Normal: + if be.dynamicSplitEnabled { + be.rangeChecker = range_checker.NewTableSpanRangeChecker(be.spanController.GetkeyspaceID(), be.blockedDispatchers.TableIDs) + } else { + be.rangeChecker = range_checker.NewTableCountChecker(be.blockedDispatchers.TableIDs) + } + case heartbeatpb.InfluenceType_DB: + be.createRangeCheckerForTypeDB() + case heartbeatpb.InfluenceType_All: + be.createRangeCheckerForTypeAll() + } +} + +func (be *BarrierEvent) getTasksByBlockedTableID(tableID int64) []*replica.SpanReplication { + if tableID != common.DDLSpanTableID { + return be.spanController.GetTasksByTableID(tableID) + } + ddlReplication := be.spanController.GetTaskByID(be.spanController.GetDDLDispatcherID()) + if ddlReplication == nil { + return nil + } + return []*replica.SpanReplication{ddlReplication} +} + +func (be *BarrierEvent) relatedReplications() []*replica.SpanReplication { + if be.blockedDispatchers == nil { + return nil + } + + switch be.blockedDispatchers.InfluenceType { + case heartbeatpb.InfluenceType_Normal: + replications := make([]*replica.SpanReplication, 0, len(be.blockedDispatchers.TableIDs)) + for _, tableID := range be.blockedDispatchers.TableIDs { + replications = 
append(replications, be.getTasksByBlockedTableID(tableID)...) + } + return replications + case heartbeatpb.InfluenceType_DB: + replications := be.spanController.GetTasksBySchemaID(be.blockedDispatchers.SchemaID) + if ddlReplication := be.spanController.GetTaskByID(be.spanController.GetDDLDispatcherID()); ddlReplication != nil { + replications = append(replications, ddlReplication) + } + return replications + case heartbeatpb.InfluenceType_All: + return be.spanController.GetAllTasks() + } + return nil +} + +func (be *BarrierEvent) addAdvancedReplicationsToRangeChecker() { + if be.rangeChecker == nil { + return + } + + for _, replication := range be.relatedReplications() { + if replication == nil || !forwardBarrierEvent(replication, be) { + continue + } + be.reportedDispatchers[replication.ID] = struct{}{} + be.rangeChecker.AddSubRange(replication.Span.TableID, replication.Span.StartKey, replication.Span.EndKey) + } +} + +func (be *BarrierEvent) refreshSelectedProgress() bool { + be.ensureRangeChecker() + be.addAdvancedReplicationsToRangeChecker() + if be.writerDispatcherAdvanced { + return false + } + + writer := be.spanController.GetTaskByID(be.writerDispatcher) + if writer == nil || !forwardBarrierEvent(writer, be) { + return false + } + if be.needSchedule { + return true + } + be.writerDispatcherAdvanced = true + be.lastResendTime = time.Now().Add(-20 * time.Second) + return true +} + +func (be *BarrierEvent) resetProgressAfterSelection() { + be.ensureRangeChecker() + if be.rangeChecker != nil { + be.rangeChecker.Reset() + } + be.reportedDispatchers = make(map[common.DispatcherID]struct{}) + be.addAdvancedReplicationsToRangeChecker() +} + +func (be *BarrierEvent) selectByForwardedDispatcher() { + be.resetProgressAfterSelection() + be.selected.Store(true) + be.writerDispatcherAdvanced = true + be.passActionSent = false +} + +func (be *BarrierEvent) markMissingDroppedTablesDone() bool { + if be.blockedDispatchers == nil || be.blockedDispatchers.InfluenceType != 
heartbeatpb.InfluenceType_Normal || + be.dropDispatchers == nil || be.dropDispatchers.InfluenceType != heartbeatpb.InfluenceType_Normal { + return false + } + + be.ensureRangeChecker() + if be.rangeChecker == nil { + return false + } + + marked := false + for _, tableID := range be.dropDispatchers.TableIDs { + if tableID == common.DDLSpanTableID || !containsTableID(be.blockedDispatchers.TableIDs, tableID) { + continue + } + if len(be.spanController.GetTasksByTableID(tableID)) != 0 { + continue + } + if be.spanController.GetTaskByID(be.spanController.GetDDLDispatcherID()) == nil { + continue + } + + be.markTableDone(tableID) + marked = true + log.Info("blocked table has no active dispatcher, mark it done", + zap.String("changefeed", be.cfID.Name()), + zap.Uint64("commitTs", be.commitTs), + zap.Int64("tableID", tableID), + zap.Int64("mode", be.mode)) + } + return marked +} + +func containsTableID(tableIDs []int64, target int64) bool { + for _, tableID := range tableIDs { + if tableID == target { + return true + } + } + return false +} + func (be *BarrierEvent) markDispatcherEventDone(dispatcherID common.DispatcherID) { if be.selected.Load() { // After selection, every accepted status means the chosen write/pass path @@ -480,7 +623,7 @@ func (be *BarrierEvent) sendPassAction(mode int64) []*messaging.TargetMessage { } case heartbeatpb.InfluenceType_Normal: for _, tableID := range be.blockedDispatchers.TableIDs { - spans := be.spanController.GetTasksByTableID(tableID) + spans := be.getTasksByBlockedTableID(tableID) if len(spans) == 0 { be.markTableDone(tableID) } else { @@ -521,17 +664,30 @@ func (be *BarrierEvent) sendPassAction(mode int64) []*messaging.TargetMessage { func (be *BarrierEvent) checkBlockedDispatchers() { switch be.blockedDispatchers.InfluenceType { case heartbeatpb.InfluenceType_Normal: - for _, tableId := range be.blockedDispatchers.TableIDs { - replications := be.spanController.GetTasksByTableID(tableId) + if be.markMissingDroppedTablesDone() && 
be.allDispatcherReported() { + // A normal DDL barrier can be recreated by a late WAITING status after the + // original barrier has already scheduled the drop and removed the table + // dispatcher. The removed table cannot report again, so advance the + // recreated barrier and let sendPassAction notify the remaining DDL span. + be.selectByForwardedDispatcher() + log.Info("all missing dropped blocked tables are removed, advance block event", + zap.String("changefeed", be.cfID.Name()), + zap.Uint64("commitTs", be.commitTs), + zap.Any("blocker", be.blockedDispatchers), + zap.Int64("mode", be.mode)) + return + } + + for _, tableID := range be.blockedDispatchers.TableIDs { + replications := be.getTasksByBlockedTableID(tableID) for _, replication := range replications { if forwardBarrierEvent(replication, be) { // one related table has forward checkpointTs, means the block event can be advanced - be.selected.Store(true) - be.writerDispatcherAdvanced = true + be.selectByForwardedDispatcher() log.Info("one related dispatcher has forward checkpointTs, means the block event can be advanced", zap.String("changefeed", be.cfID.Name()), zap.Uint64("commitTs", be.commitTs), - zap.Int64("tableId", tableId), + zap.Int64("tableID", tableID), zap.Uint64("checkpointTs", replication.GetStatus().CheckpointTs), zap.String("dispatcher", replication.ID.String()), zap.Int64("mode", be.mode), @@ -546,8 +702,7 @@ func (be *BarrierEvent) checkBlockedDispatchers() { for _, replication := range replications { if forwardBarrierEvent(replication, be) { // One related dispatcher has moved past the barrier, so the block event can advance. 
- be.selected.Store(true) - be.writerDispatcherAdvanced = true + be.selectByForwardedDispatcher() log.Info("one related dispatcher has forward checkpointTs, means the block event can be advanced", zap.String("changefeed", be.cfID.Name()), zap.Uint64("commitTs", be.commitTs), @@ -564,8 +719,7 @@ func (be *BarrierEvent) checkBlockedDispatchers() { for _, replication := range replications { if forwardBarrierEvent(replication, be) { // One related dispatcher has moved past the barrier, so the block event can advance. - be.selected.Store(true) - be.writerDispatcherAdvanced = true + be.selectByForwardedDispatcher() log.Info("one related dispatcher has forward checkpointTs, means the block event can be advanced", zap.String("changefeed", be.cfID.Name()), zap.Uint64("commitTs", be.commitTs), @@ -582,10 +736,10 @@ func (be *BarrierEvent) checkBlockedDispatchers() { // forwardBarrierEvent returns true if `replication` is known to have passed `event`. // // We intentionally avoid `checkpointTs >= commitTs`: a dispatcher may be recreated with -// `startTs == commitTs` and not skip the syncpoint at that ts, so it may report -// `checkpointTs == commitTs` before the syncpoint is actually flushed. We only forward when the -// replication is strictly beyond the barrier, or when ordering guarantees it (replication is in a -// syncpoint barrier at the same ts while `event` is a DDL barrier). +// `startTs == commitTs` and still need to flush the syncpoint at that ts. We only forward when the +// replication is strictly beyond the barrier, when it already reported DONE for this exact barrier, +// or when ordering guarantees it (replication is in a syncpoint barrier at the same ts while `event` +// is a DDL barrier). 
func forwardBarrierEvent(replication *replica.SpanReplication, event *BarrierEvent) bool { if replication.GetStatus().CheckpointTs > event.commitTs { return true @@ -596,6 +750,9 @@ func forwardBarrierEvent(replication *replica.SpanReplication, event *BarrierEve if blockState.BlockTs > event.commitTs { return true } else if blockState.BlockTs == event.commitTs { + if blockState.Stage == heartbeatpb.BlockStage_DONE && blockState.IsSyncPoint == event.isSyncPoint { + return true + } // If the replication is already blocked by a syncpoint at the same ts, it must have // processed the DDL barrier at that ts already (barrier events are ordered by (commitTs, isSyncPoint)). if blockState.IsSyncPoint && !event.isSyncPoint { @@ -607,6 +764,10 @@ func forwardBarrierEvent(replication *replica.SpanReplication, event *BarrierEve } func (be *BarrierEvent) resend(mode int64) []*messaging.TargetMessage { + return be.resendWithSchedule(mode, nil) +} + +func (be *BarrierEvent) resendWithSchedule(mode int64, trySchedule func(*BarrierEvent) bool) []*messaging.TargetMessage { now := time.Now() if now.Sub(be.lastResendTime) < time.Second { return nil @@ -657,6 +818,12 @@ func (be *BarrierEvent) resend(mode int64) []*messaging.TargetMessage { be.checkBlockedDispatchers() return nil } + writerForwarded := be.refreshSelectedProgress() + if writerForwarded && be.needSchedule && !be.writerDispatcherAdvanced { + if trySchedule == nil || !trySchedule(be) { + return nil + } + } // we select a dispatcher as the writer, still waiting for that dispatcher advance its checkpoint ts if !be.writerDispatcherAdvanced { be.lastResendTime = now @@ -684,7 +851,7 @@ func (be *BarrierEvent) resend(mode int64) []*messaging.TargetMessage { } tableID := be.blockedDispatchers.TableIDs[0] - replications := be.spanController.GetTasksByTableID(tableID) + replications := be.getTasksByBlockedTableID(tableID) if len(replications) == 0 { log.Panic("replications for this block event should not be empty", diff --git 
a/maintainer/barrier_test.go b/maintainer/barrier_test.go index 5e982762a5..de06d14ce8 100644 --- a/maintainer/barrier_test.go +++ b/maintainer/barrier_test.go @@ -826,6 +826,491 @@ func TestSyncPointBlock(t *testing.T) { require.Len(t, barrier.blockedEvents.m, 0) } +func TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers(t *testing.T) { + testutil.SetUpTestServices(t) + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: 1}, 1) + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: 2}, 1) + absents := spanController.GetAbsentForTest(10000) + require.Len(t, absents, 2) + for _, stm := range absents { + spanController.BindSpanToNode("", "node1", stm) + spanController.MarkSpanReplicating(stm) + } + + barrier := NewBarrier(spanController, operatorController, false, nil, common.DefaultMode) + waitingState := &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_All, + }, + IsSyncPoint: true, + } + + msgs := barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: spanController.GetDDLDispatcherID().ToPB(), State: waitingState}, + {ID: 
absents[0].ID.ToPB(), State: waitingState}, + {ID: absents[1].ID.ToPB(), State: waitingState}, + }, + }) + require.NotEmpty(t, msgs) + key := getEventKey(10, true) + event := barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.True(t, event.selected.Load()) + + doneState := &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_DONE, + IsSyncPoint: true, + } + _ = barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: spanController.GetDDLDispatcherID().ToPB(), State: doneState}, + {ID: absents[0].ID.ToPB(), State: doneState}, + {ID: absents[1].ID.ToPB(), State: doneState}, + }, + }) + require.Len(t, barrier.blockedEvents.m, 0) + + // A late WAITING report for the same syncpoint can recreate the barrier after + // the first one was removed. The recreated event must still count dispatchers + // whose span state already says DONE for this exact syncpoint. 
+ _ = barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: absents[1].ID.ToPB(), State: waitingState}, + }, + }) + event = barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.Nil(t, event.rangeChecker) + + absents[1].UpdateStatus(&heartbeatpb.TableSpanStatus{ + ID: absents[1].ID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 11, + }) + + resendMsgs := barrier.Resend() + require.Empty(t, resendMsgs) + require.Len(t, barrier.blockedEvents.m, 0) +} + +func TestNormalBarrierRecreatedAfterDroppedTableRemoved(t *testing.T) { + testutil.SetUpTestServices(t) + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + + oldTableID := int64(267) + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: oldTableID}, 1) + oldReplication := spanController.GetTasksByTableID(oldTableID)[0] + spanController.BindSpanToNode("", "node1", oldReplication) + spanController.MarkSpanReplicating(oldReplication) + + barrier := NewBarrier(spanController, operatorController, false, nil, common.DefaultMode) + dropState := &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + TableIDs: 
[]int64{oldTableID, common.DDLSpanTableID}, + }, + NeedDroppedTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + TableIDs: []int64{oldTableID}, + }, + } + + // The original barrier has already scheduled the DROP TABLE and removed the + // table dispatcher. A late WAITING report from the DDL dispatcher recreates + // the same barrier and must not wait forever for the removed table dispatcher. + spanController.RemoveByTableIDs(oldTableID) + msgs := barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: spanController.GetDDLDispatcherID().ToPB(), State: dropState}, + }, + }) + require.NotEmpty(t, msgs) + key := getEventKey(10, false) + event := barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.False(t, event.selected.Load()) + + resendMsgs := barrier.Resend() + require.Empty(t, resendMsgs) + event = barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.True(t, event.selected.Load()) + require.True(t, event.writerDispatcherAdvanced) + + resendMsgs = barrier.Resend() + require.Len(t, resendMsgs, 1) + resp := resendMsgs[0].Message[0].(*heartbeatpb.HeartBeatResponse) + require.Len(t, resp.DispatcherStatuses, 1) + require.Equal(t, heartbeatpb.Action_Pass, resp.DispatcherStatuses[0].Action.Action) + require.Equal(t, uint64(10), resp.DispatcherStatuses[0].Action.CommitTs) + require.Len(t, resp.DispatcherStatuses[0].InfluencedDispatchers.DispatcherIDs, 1) + require.Equal(t, spanController.GetDDLDispatcherID().ToPB(), resp.DispatcherStatuses[0].InfluencedDispatchers.DispatcherIDs[0]) + + doneState := &heartbeatpb.State{IsBlocked: true, BlockTs: 10, Stage: heartbeatpb.BlockStage_DONE} + _ = barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: spanController.GetDDLDispatcherID().ToPB(), State: doneState}, + }, + }) + 
require.Len(t, barrier.blockedEvents.m, 0) +} + +func TestNormalBarrierDoesNotCoverMissingNonDroppedTable(t *testing.T) { + testutil.SetUpTestServices(t) + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + + missingTableID := int64(267) + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: missingTableID}, 1) + missingReplication := spanController.GetTasksByTableID(missingTableID)[0] + spanController.BindSpanToNode("", "node1", missingReplication) + spanController.MarkSpanReplicating(missingReplication) + spanController.RemoveByTableIDs(missingTableID) + + barrier := NewBarrier(spanController, operatorController, false, nil, common.DefaultMode) + alterState := &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + TableIDs: []int64{missingTableID, common.DDLSpanTableID}, + }, + } + msgs := barrier.HandleStatus("node1", &heartbeatpb.BlockStatusRequest{ + ChangefeedID: cfID.ToPB(), + BlockStatuses: []*heartbeatpb.TableSpanBlockStatus{ + {ID: spanController.GetDDLDispatcherID().ToPB(), State: alterState}, + }, + }) + require.NotEmpty(t, msgs) + key := getEventKey(10, false) + event := barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.False(t, event.selected.Load()) + + 
resendMsgs := barrier.Resend() + require.Empty(t, resendMsgs) + event = barrier.blockedEvents.m[key] + require.NotNil(t, event) + require.False(t, event.selected.Load()) + require.Contains(t, event.rangeChecker.Detail(), "267") +} + +func TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID(t *testing.T) { + testutil.SetUpTestServices(t) + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 11, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: 1}, 1) + tableReplication := spanController.GetTasksByTableID(1)[0] + spanController.BindSpanToNode("", "node1", tableReplication) + spanController.MarkSpanReplicating(tableReplication) + + state := &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + TableIDs: []int64{1, common.DDLSpanTableID}, + }, + } + event := NewBlockEvent(cfID, tableReplication.ID, spanController, operatorController, state, false, common.DefaultMode) + require.False(t, event.selected.Load()) + require.False(t, event.allDispatcherReported()) + + event.checkBlockedDispatchers() + require.True(t, event.selected.Load()) + require.True(t, event.writerDispatcherAdvanced) + + event.lastResendTime = time.Now().Add(-2 * time.Second) + msgs := event.resend(common.DefaultMode) + 
require.Len(t, msgs, 1) + resp := msgs[0].Message[0].(*heartbeatpb.HeartBeatResponse) + require.Len(t, resp.DispatcherStatuses, 1) + status := resp.DispatcherStatuses[0] + require.Equal(t, heartbeatpb.Action_Pass, status.Action.Action) + require.Equal(t, uint64(10), status.Action.CommitTs) + require.Len(t, status.InfluencedDispatchers.DispatcherIDs, 2) + + gotDispatchers := make(map[common.DispatcherID]struct{}, len(status.InfluencedDispatchers.DispatcherIDs)) + for _, dispatcherID := range status.InfluencedDispatchers.DispatcherIDs { + gotDispatchers[common.NewDispatcherIDFromPB(dispatcherID)] = struct{}{} + } + _, ok := gotDispatchers[tableReplication.ID] + require.True(t, ok) + _, ok = gotDispatchers[spanController.GetDDLDispatcherID()] + require.True(t, ok) +} + +func TestResendSchedulesForwardedNeedScheduleBarrierBeforePass(t *testing.T) { + testutil.SetUpTestServices(t) + nodeManager := appcontext.GetService[*watcher.NodeManager](watcher.NodeManagerName) + nodeManager.GetAliveNodes()["node1"] = &node.Info{ID: "node1"} + + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 20, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + barrier := NewBarrier(spanController, operatorController, false, nil, common.DefaultMode) + + event := NewBlockEvent(cfID, tableTriggerEventDispatcherID, spanController, operatorController, &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: 
heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_Normal, + TableIDs: []int64{common.DDLSpanTableID}, + }, + NeedAddedTables: []*heartbeatpb.Table{{SchemaID: 1, TableID: 2}}, + }, false, common.DefaultMode) + event.selected.Store(true) + event.writerDispatcher = tableTriggerEventDispatcherID + event.lastResendTime = time.Now().Add(-2 * time.Second) + barrier.blockedEvents.Set(getEventKey(10, false), event) + barrier.pendingEvents.add(event) + + msgs := barrier.Resend() + require.NotEmpty(t, msgs) + require.True(t, event.writerDispatcherAdvanced) + require.Equal(t, 0, barrier.pendingEvents.Len()) + require.Equal(t, 1, spanController.GetAbsentSize()) + resp := msgs[0].Message[0].(*heartbeatpb.HeartBeatResponse) + require.Equal(t, heartbeatpb.Action_Pass, resp.DispatcherStatuses[0].Action.Action) +} + +func TestSelectedBarrierRefreshesAdvancedReplications(t *testing.T) { + testutil.SetUpTestServices(t) + tableTriggerEventDispatcherID := common.NewDispatcherID() + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, + common.DDLSpanSchemaID, + common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ + ID: tableTriggerEventDispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + spanController := span.NewController(cfID, ddlSpan, nil, nil, nil, common.DefaultKeyspaceID, common.DefaultMode) + operatorController := operator.NewOperatorController(cfID, spanController, 1000, common.DefaultMode) + + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: 1}, 1) + spanController.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: 2}, 1) + absents := spanController.GetAbsentForTest(10000) + require.Len(t, absents, 2) + for _, stm := range absents { + spanController.BindSpanToNode("", "node1", stm) + 
spanController.MarkSpanReplicating(stm) + } + + event := NewBlockEvent(cfID, tableTriggerEventDispatcherID, spanController, operatorController, &heartbeatpb.State{ + IsBlocked: true, + BlockTs: 10, + Stage: heartbeatpb.BlockStage_WAITING, + BlockTables: &heartbeatpb.InfluencedTables{ + InfluenceType: heartbeatpb.InfluenceType_All, + }, + IsSyncPoint: true, + }, false, common.DefaultMode) + event.ensureRangeChecker() + event.selected.Store(true) + event.writerDispatcher = tableTriggerEventDispatcherID + event.lastResendTime = time.Now().Add(-2 * time.Second) + + for _, replication := range event.relatedReplications() { + replication.UpdateBlockState(heartbeatpb.State{ + IsBlocked: true, + BlockTs: 20, + Stage: heartbeatpb.BlockStage_WAITING, + }) + } + + _ = event.resend(common.DefaultMode) + require.True(t, event.writerDispatcherAdvanced) + require.True(t, event.allDispatcherReported()) +} + +func TestUpdateSpanBlockStateSkipsStaleState(t *testing.T) { + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + dispatcherID := common.NewDispatcherID() + tableSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, 1) + replication := replica.NewWorkingSpanReplication(cfID, dispatcherID, 1, &tableSpan, &heartbeatpb.TableSpanStatus{ + ID: dispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1, + }, "node1", false) + + updateSpanBlockState(replication, &heartbeatpb.State{BlockTs: 10, IsSyncPoint: true, Stage: heartbeatpb.BlockStage_DONE}) + updateSpanBlockState(replication, &heartbeatpb.State{BlockTs: 10, IsSyncPoint: true, Stage: heartbeatpb.BlockStage_WAITING}) + state := replication.GetBlockState() + require.Equal(t, uint64(10), state.BlockTs) + require.True(t, state.IsSyncPoint) + require.Equal(t, heartbeatpb.BlockStage_DONE, state.Stage) + + updateSpanBlockState(replication, &heartbeatpb.State{BlockTs: 9, Stage: heartbeatpb.BlockStage_WAITING}) + state = replication.GetBlockState() + 
require.Equal(t, uint64(10), state.BlockTs) + require.True(t, state.IsSyncPoint) + require.Equal(t, heartbeatpb.BlockStage_DONE, state.Stage) + + updateSpanBlockState(replication, &heartbeatpb.State{BlockTs: 11, Stage: heartbeatpb.BlockStage_WAITING}) + state = replication.GetBlockState() + require.Equal(t, uint64(11), state.BlockTs) + require.False(t, state.IsSyncPoint) + require.Equal(t, heartbeatpb.BlockStage_WAITING, state.Stage) +} + +func TestForwardBarrierEventBoundaries(t *testing.T) { + cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) + tableSpan := common.TableIDToComparableSpan(common.DefaultKeyspaceID, 1) + newReplication := func(checkpointTs uint64, blockState *heartbeatpb.State) *replica.SpanReplication { + dispatcherID := common.NewDispatcherID() + replication := replica.NewWorkingSpanReplication(cfID, dispatcherID, + 1, &tableSpan, &heartbeatpb.TableSpanStatus{ + ID: dispatcherID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: checkpointTs, + }, "node1", false) + if blockState != nil { + replication.UpdateBlockState(*blockState) + } + return replication + } + + ddlEvent := &BarrierEvent{commitTs: 10, isSyncPoint: false} + syncpointEvent := &BarrierEvent{commitTs: 10, isSyncPoint: true} + tests := []struct { + name string + checkpointTs uint64 + blockState *heartbeatpb.State + event *BarrierEvent + want bool + }{ + { + name: "checkpoint equal commit ts does not forward syncpoint", + checkpointTs: 10, + event: syncpointEvent, + want: false, + }, + { + name: "checkpoint greater than commit ts forwards syncpoint", + checkpointTs: 11, + event: syncpointEvent, + want: true, + }, + { + name: "same ts syncpoint waiting does not forward syncpoint", + checkpointTs: 9, + blockState: &heartbeatpb.State{BlockTs: 10, IsSyncPoint: true, Stage: heartbeatpb.BlockStage_WAITING}, + event: syncpointEvent, + want: false, + }, + { + name: "same ts ddl done does not forward syncpoint", + checkpointTs: 9, + 
blockState: &heartbeatpb.State{BlockTs: 10, IsSyncPoint: false, Stage: heartbeatpb.BlockStage_DONE}, + event: syncpointEvent, + want: false, + }, + { + name: "same ts syncpoint done forwards syncpoint", + checkpointTs: 9, + blockState: &heartbeatpb.State{BlockTs: 10, IsSyncPoint: true, Stage: heartbeatpb.BlockStage_DONE}, + event: syncpointEvent, + want: true, + }, + { + name: "same ts syncpoint waiting forwards ddl", + checkpointTs: 9, + blockState: &heartbeatpb.State{BlockTs: 10, IsSyncPoint: true, Stage: heartbeatpb.BlockStage_WAITING}, + event: ddlEvent, + want: true, + }, + { + name: "same ts ddl done forwards ddl", + checkpointTs: 9, + blockState: &heartbeatpb.State{BlockTs: 10, IsSyncPoint: false, Stage: heartbeatpb.BlockStage_DONE}, + event: ddlEvent, + want: true, + }, + { + name: "later normal waiting forwards syncpoint", + checkpointTs: 9, + blockState: &heartbeatpb.State{BlockTs: 11, IsSyncPoint: false, Stage: heartbeatpb.BlockStage_WAITING}, + event: syncpointEvent, + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + replication := newReplication(tt.checkpointTs, tt.blockState) + require.Equal(t, tt.want, forwardBarrierEvent(replication, tt.event)) + }) + } +} + func TestNonBlocked(t *testing.T) { testutil.SetUpTestServices(t) tableTriggerEventDispatcherID := common.NewDispatcherID() diff --git a/maintainer/operator/operator_move.go b/maintainer/operator/operator_move.go index d45dec35bf..ea7634351f 100644 --- a/maintainer/operator/operator_move.go +++ b/maintainer/operator/operator_move.go @@ -118,6 +118,7 @@ func (m *MoveDispatcherOperator) Check(from node.ID, status *heartbeatpb.TableSp if from == m.origin && status.ComponentStatus != heartbeatpb.ComponentState_Working { log.Info("replica set removed from origin node", zap.String("replicaSet", m.replicaSet.ID.String())) + m.replicaSet.UpdateStatus(status) // reset last send message time m.sendThrottler.reset() diff --git 
a/maintainer/operator/operator_move_test.go b/maintainer/operator/operator_move_test.go index 60558a9a07..7336606a4e 100644 --- a/maintainer/operator/operator_move_test.go +++ b/maintainer/operator/operator_move_test.go @@ -243,6 +243,26 @@ func TestMoveOperator_OriginNodeRemovedAfterOriginStopped(t *testing.T) { require.True(t, op.IsFinished()) } +func TestMoveOperatorUsesStoppedCheckpointWhenAddingDest(t *testing.T) { + spanController, _, replicaSet, nodeA, nodeB := setupTestEnvironment(t) + op := NewMoveDispatcherOperator(spanController, replicaSet, nodeA, nodeB) + + op.Start() + stoppedStatus := &heartbeatpb.TableSpanStatus{ + ID: replicaSet.ID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Stopped, + CheckpointTs: 1500, + } + op.Check(nodeA, stoppedStatus) + require.Equal(t, moveStateAddDest, op.state) + + msg := op.Schedule() + require.NotNil(t, msg) + req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) + require.Equal(t, heartbeatpb.ScheduleAction_Create, req.ScheduleAction) + require.Equal(t, uint64(1500), req.Config.StartTs) +} + func TestMoveOperator_BothNodesRemovedBeforeStartDoesNotLeaveSchedulingWithoutNodeID(t *testing.T) { messageCenter, _, _ := messaging.NewMessageCenterForTest(t) appcontext.SetService(appcontext.MessageCenter, messageCenter) diff --git a/maintainer/operator/operator_remove.go b/maintainer/operator/operator_remove.go index 43a95c21c6..6dff5387e6 100644 --- a/maintainer/operator/operator_remove.go +++ b/maintainer/operator/operator_remove.go @@ -97,6 +97,7 @@ func (m *removeDispatcherOperator) Check(from node.ID, status *heartbeatpb.Table (status.ComponentStatus == heartbeatpb.ComponentState_Stopped || status.ComponentStatus == heartbeatpb.ComponentState_Removed) { m.replicaSet.UpdateStatus(status) + m.spanController.RecordRemovedSpanCheckpoint(m.replicaSet, status.CheckpointTs) log.Info("dispatcher report non-working status", zap.String("replicaSet", m.replicaSet.ID.String())) m.finished.Store(true) diff --git 
a/maintainer/span/span_controller.go b/maintainer/span/span_controller.go index 40fcea4d47..5fca6b7d13 100644 --- a/maintainer/span/span_controller.go +++ b/maintainer/span/span_controller.go @@ -70,6 +70,10 @@ type Controller struct { tableTasks map[int64]map[common.DispatcherID]*replica.SpanReplication // nonReplicatingCheckpointTs tracks absent and scheduling spans so checkpoint calculation does not scan all spans. nonReplicatingCheckpointTs *checkpointTsTracker + // removedTableCheckpointTs records the highest checkpoint reported by removed dispatchers for each table. + // It prevents a stale add-table barrier from recreating a dispatcher below events already flushed by + // the previous dispatcher of the same physical table. + removedTableCheckpointTs map[int64]uint64 // newGroupChecker creates a GroupChecker for validating span groups newGroupChecker func(groupID pkgreplica.GroupID) pkgreplica.GroupChecker[common.DispatcherID, *replica.SpanReplication] @@ -115,6 +119,7 @@ func NewController( tableTasks: make(map[int64]map[common.DispatcherID]*replica.SpanReplication), allTasks: make(map[common.DispatcherID]*replica.SpanReplication), nonReplicatingCheckpointTs: newCheckpointTsTracker(), + removedTableCheckpointTs: make(map[int64]uint64), } c.ReplicationDB = pkgreplica.NewReplicationDB(changefeedID.String(), c.doWithRLock, c.newGroupChecker) c.initializeDDLSpan(ddlSpan) @@ -227,11 +232,47 @@ func (c *Controller) AddNewSpans(schemaID int64, tableSpans []*heartbeatpb.Table for _, span := range tableSpans { dispatcherID := common.NewDispatcherID() span.KeyspaceID = c.GetkeyspaceID() - replicaSet := replica.NewSpanReplication(c.changefeedID, dispatcherID, schemaID, span, startTs, c.mode, enabledSplit) + safeStartTs := c.getSafeStartTsForTable(span.TableID, startTs) + replicaSet := replica.NewSpanReplication(c.changefeedID, dispatcherID, schemaID, span, safeStartTs, c.mode, enabledSplit) c.AddAbsentReplicaSet(replicaSet) } } +// RecordRemovedSpanCheckpoint records 
the final checkpoint of a dispatcher that has been removed. +func (c *Controller) RecordRemovedSpanCheckpoint(span *replica.SpanReplication, checkpointTs uint64) { + if span == nil { + return + } + c.mu.Lock() + defer c.mu.Unlock() + c.advanceRemovedTableCheckpointTsWithoutLock(span.Span.TableID, checkpointTs) +} + +func (c *Controller) getSafeStartTsForTable(tableID int64, startTs uint64) uint64 { + c.mu.RLock() + removedCheckpointTs := c.removedTableCheckpointTs[tableID] + c.mu.RUnlock() + if removedCheckpointTs <= startTs { + return startTs + } + log.Info("clamp new table dispatcher start ts to removed table checkpoint", + zap.Stringer("changefeedID", c.changefeedID), + zap.Int64("tableID", tableID), + zap.Uint64("originalStartTs", startTs), + zap.Uint64("removedCheckpointTs", removedCheckpointTs)) + return removedCheckpointTs +} + +func (c *Controller) advanceRemovedTableCheckpointTsWithoutLock(tableID int64, checkpointTs uint64) { + if checkpointTs == 0 { + return + } + if old := c.removedTableCheckpointTs[tableID]; old >= checkpointTs { + return + } + c.removedTableCheckpointTs[tableID] = checkpointTs +} + func (c *Controller) GetMinCheckpointTsForNonReplicatingSpans(minCheckpointTs uint64) uint64 { c.mu.RLock() defer c.mu.RUnlock() @@ -590,6 +631,9 @@ func (c *Controller) RemoveBySchemaID(schemaID int64) { // removeSpanWithoutLock removes the spans from the db without lock func (c *Controller) removeSpanWithoutLock(spans ...*replica.SpanReplication) { for _, span := range spans { + if status := span.GetStatus(); status != nil { + c.advanceRemovedTableCheckpointTsWithoutLock(span.Span.TableID, status.CheckpointTs) + } c.RemoveReplicaWithoutLock(span) c.untrackNonReplicatingSpan(span) diff --git a/maintainer/span/span_controller_test.go b/maintainer/span/span_controller_test.go index 827f175e12..7357fee943 100644 --- a/maintainer/span/span_controller_test.go +++ b/maintainer/span/span_controller_test.go @@ -610,6 +610,63 @@ func TestMarkSpanAbsent(t 
*testing.T) { require.Equal(t, "", replicaSpan.GetNodeID().String()) } +func TestControllerAddNewTableClampsToRemovedTableCheckpoint(t *testing.T) { + controller := newControllerWithCheckerForTest(t) + tableID := int64(100) + oldID := common.NewDispatcherID() + oldSpan := replica.NewWorkingSpanReplication( + controller.changefeedID, + oldID, + 1, + testutil.GetTableSpanByID(tableID), + &heartbeatpb.TableSpanStatus{ + ID: oldID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1000, + }, + "node1", + false, + ) + + controller.AddReplicatingSpan(oldSpan) + controller.RemoveByTableIDs(tableID) + controller.RecordRemovedSpanCheckpoint(oldSpan, 1500) + + controller.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: tableID}, 900) + tasks := controller.GetTasksByTableID(tableID) + require.Len(t, tasks, 1) + require.Equal(t, uint64(1500), tasks[0].GetStatus().CheckpointTs) + + msg := tasks[0].NewAddDispatcherMessage("node1", heartbeatpb.OperatorType_O_Add) + req := msg.Message[0].(*heartbeatpb.ScheduleDispatcherRequest) + require.Equal(t, uint64(1500), req.Config.StartTs) +} + +func TestControllerAddNewTableIgnoresLowerRemovedTableCheckpoint(t *testing.T) { + controller := newControllerWithCheckerForTest(t) + tableID := int64(101) + oldID := common.NewDispatcherID() + oldSpan := replica.NewWorkingSpanReplication( + controller.changefeedID, + oldID, + 1, + testutil.GetTableSpanByID(tableID), + &heartbeatpb.TableSpanStatus{ + ID: oldID.ToPB(), + ComponentStatus: heartbeatpb.ComponentState_Working, + CheckpointTs: 1000, + }, + "node1", + false, + ) + + controller.RecordRemovedSpanCheckpoint(oldSpan, 800) + controller.AddNewTable(commonEvent.Table{SchemaID: 1, TableID: tableID}, 900) + tasks := controller.GetTasksByTableID(tableID) + require.Len(t, tasks, 1) + require.Equal(t, uint64(900), tasks[0].GetStatus().CheckpointTs) +} + func newControllerWithCheckerForTest(t *testing.T) *Controller { testutil.SetUpTestServices(t) cfID := 
common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) diff --git a/pkg/eventservice/event_scanner.go b/pkg/eventservice/event_scanner.go index b082ccfc13..d4a948e870 100644 --- a/pkg/eventservice/event_scanner.go +++ b/pkg/eventservice/event_scanner.go @@ -229,9 +229,11 @@ func (s *eventScanner) scanAndMergeEvents( if err != nil { return false, err } - // table is deleted, still append remaining DDL event and resolved event. + // The table has been deleted, so the current raw event cannot be + // decoded as DML. Resolve to its commit ts to skip it; resolving to + // rawEvent.CRTs-1 can equal the scan start and cause a no-progress loop. if tableInfo == nil { - err = finalizeScan(merger, processor, session, rawEvent.CRTs-1) + err = finalizeScan(merger, processor, session, rawEvent.CRTs) return false, err } diff --git a/pkg/eventservice/event_scanner_test.go b/pkg/eventservice/event_scanner_test.go index 7a76adeae9..bbf1cbdeb9 100644 --- a/pkg/eventservice/event_scanner_test.go +++ b/pkg/eventservice/event_scanner_test.go @@ -449,6 +449,7 @@ func TestEventScannerWithDeleteTable(t *testing.T) { dml0 := kvEvents[0] dml1 := kvEvents[1] dml2 := kvEvents[2] + dml3 := kvEvents[3] mockSchemaStore.DeleteTable(tableID, dml2.CRTs) disp.receivedResolvedTs.Store(resolvedTs) ok, dataRange := broker.getScanTaskDataRange(disp) @@ -480,10 +481,12 @@ func TestEventScannerWithDeleteTable(t *testing.T) { require.Equal(t, batchDML1.DMLEvents[0].GetCommitTs(), dml1.CRTs) require.Equal(t, batchDML1.DMLEvents[1].GetCommitTs(), dml2.CRTs) - // resolvedTs + // resolvedTs skips the first raw event after the table is deleted, so the + // next scan range will not keep seeing the same deleted-table event. 
e = events[3] require.Equal(t, e.GetType(), event.TypeResolvedEvent) - require.Equal(t, dml2.CRTs, e.GetCommitTs()) + require.Equal(t, dml3.CRTs, e.GetCommitTs()) + require.Greater(t, e.GetCommitTs(), dml2.CRTs) } // TestEventScannerWithDDL tests cases where scanning is interrupted at DDL events @@ -1567,6 +1570,66 @@ func TestScanAndMergeEventsSingleUKUpdate(t *testing.T) { require.True(t, sess.scannedBytes > 0) // Some bytes were processed } +func TestScanAndMergeEventsSkipsDeletedTableTxn(t *testing.T) { + helper := event.NewEventTestHelper(t) + defer helper.Close() + + ddlEvent, kvEvents := genEvents(helper, + `create table test.t_deleted(id int primary key, c char(50))`, + `insert into test.t_deleted(id,c) values (1, "c1")`) + require.Len(t, kvEvents, 1) + rawEvent := kvEvents[0] + tableID := ddlEvent.GetTableID() + + schemaStore := &schemaStoreWithErr{ + mockSchemaStore: NewMockSchemaStore(), + getTableInfoError: &schemastore.TableDeletedError{}, + } + scanner := &eventScanner{ + mounter: &mockMounter{}, + schemaGetter: schemaStore, + } + + disInfo := newMockDispatcherInfoForTest(t) + disInfo.span.TableID = tableID + dispatcherID := common.NewDispatcherID() + disp := &dispatcherStat{ + info: disInfo, + id: dispatcherID, + isRemoved: atomic.Bool{}, + } + + dataRange := common.DataRange{ + Span: &heartbeatpb.TableSpan{ + TableID: tableID, + }, + CommitTsStart: rawEvent.CRTs - 1, + CommitTsEnd: rawEvent.CRTs + 100, + } + sess := &session{ + ctx: context.Background(), + dispatcherStat: disp, + dataRange: dataRange, + startTime: time.Now(), + events: make([]event.Event, 0), + } + merger := newEventMerger(nil) + + isInterrupted, err := scanner.scanAndMergeEvents(sess, merger, &mockEventIterator{ + events: []*common.RawKVEntry{rawEvent}, + }) + require.NoError(t, err) + require.False(t, isInterrupted) + require.Zero(t, sess.dmlCount) + require.Len(t, sess.events, 1) + + resolvedEvent, ok := sess.events[0].(event.ResolvedEvent) + require.True(t, ok) + 
require.Equal(t, dispatcherID, resolvedEvent.DispatcherID) + require.Equal(t, rawEvent.CRTs, resolvedEvent.ResolvedTs) + require.Greater(t, resolvedEvent.ResolvedTs, dataRange.CommitTsStart) +} + type schemaStoreWithErr struct { *mockSchemaStore getTableInfoError error diff --git a/tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh b/tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh new file mode 100755 index 0000000000..f6bc101457 --- /dev/null +++ b/tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -eo pipefail + +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +sink_type=${1:-mysql} + +# This script is a standalone CI entry for the random DDL+DML suite. +# It intentionally stays outside run_heavy_it_in_ci.sh so the expensive weekly +# workload can be triggered independently from the regular heavy matrix. +case "${sink_type}" in +mysql) + test_names="weekly_rand_single weekly_rand_multi weekly_rand_multi_failover weekly_rand_slow_lossy_ddl" + ;; +kafka | storage | pulsar) + test_names="weekly_rand_single weekly_rand_multi weekly_rand_multi_failover" + ;; +*) + echo "Error: unknown sink type: ${sink_type}" + exit 1 + ;; +esac + +export TICDC_NEWARCH=true +export RUN_PROFILE=${RUN_PROFILE:-weekly} +export RUN_DURATION=${RUN_DURATION:-30m} +export RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT:-120m} +export RUN_SEED=${RUN_SEED:-$(date -u +%Y%m%d%H)} + +echo "Sink Type: ${sink_type}" +echo "Run cases: ${test_names}" +echo "RUN_PROFILE=${RUN_PROFILE}" +echo "RUN_DURATION=${RUN_DURATION}" +echo "RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT}" +echo "RUN_SEED=${RUN_SEED}" + +"${CUR}"/run.sh "${sink_type}" "${test_names}" diff --git a/tests/integration_tests/weekly_rand_multi/conf/changefeed.toml b/tests/integration_tests/weekly_rand_multi/conf/changefeed.toml new file mode 100644 index 0000000000..16252a3997 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi/conf/changefeed.toml @@ -0,0 +1,4 @@ 
+[scheduler] +enable-table-across-nodes=true +region-threshold=10 +region-count-per-span=10 diff --git a/tests/integration_tests/weekly_rand_multi/conf/changefeed_mysql.toml b/tests/integration_tests/weekly_rand_multi/conf/changefeed_mysql.toml new file mode 100644 index 0000000000..b1a3308066 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi/conf/changefeed_mysql.toml @@ -0,0 +1,7 @@ +enable-sync-point = true +sync-point-interval = "30s" + +[scheduler] +enable-table-across-nodes=true +region-threshold=10 +region-count-per-span=10 diff --git a/tests/integration_tests/weekly_rand_multi/conf/consumer.toml b/tests/integration_tests/weekly_rand_multi/conf/consumer.toml new file mode 100644 index 0000000000..28f76a83b9 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi/conf/consumer.toml @@ -0,0 +1,2 @@ +[scheduler] +enable-table-across-nodes=true diff --git a/tests/integration_tests/weekly_rand_multi/run.sh b/tests/integration_tests/weekly_rand_multi/run.sh new file mode 100644 index 0000000000..715c10e559 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi/run.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# Random DDL+DML weekly smoke test (3 captures, scheduler enabled). +# +# Timeline (high level): +# 1) start_tidb_cluster: start upstream + downstream TiDB. +# 2) random_ddl_test_runner bootstrap: create identical schemas and deterministic seed data on both sides. +# 3) start TiCDC (3 captures) and create changefeed with scheduler settings. +# 4) random_ddl_test_runner workload: concurrent random DML + random DDL on upstream. +# 5) Final diff: sync_diff_inspector compares upstream vs downstream using diff_config.toml generated by the runner. +# 6) Post-check: scan logs for panic/fatal/data race patterns. +# +# Notes: +# - Storage sink: pre-create directories to avoid path creation races under multi-capture runs. 
+# +# Sequence diagram (simplified): +# run.sh +# |-> start_tidb_cluster +# |-> random_ddl_test_runner --phase bootstrap +# |-> run_cdc_server (capture=3) +# |-> cdc_cli_changefeed create (scheduler enabled) +# |-> consumer (kafka/storage/pulsar) [optional] +# |-> random_ddl_test_runner --phase workload +# |-> check_sync_diff (sync_diff_inspector) + +set -eu + +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +source $CUR/../_utils/test_prepare + +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +SINK_TYPE=$1 +CHANGEFEED_ID="weeklyrand" + +RUN_SEED=${RUN_SEED:-1} +RUN_PROFILE=${RUN_PROFILE:-smoke} +RUN_DURATION=${RUN_DURATION:-3m} +RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT:-} +if [ -z "$RUN_CONVERGE_TIMEOUT" ]; then + if [ "$RUN_PROFILE" == "weekly" ]; then + RUN_CONVERGE_TIMEOUT="120m" + else + RUN_CONVERGE_TIMEOUT="30m" + fi +fi + +function build_runner() { + mkdir -p "$WORK_DIR" + go build -o "$WORK_DIR/random_ddl_test_runner" "$CUR/../../utils/random_ddl_test_runner" +} + +function write_runner_config() { + local mysql_sync_enabled="false" + if [ "$SINK_TYPE" == "mysql" ]; then + mysql_sync_enabled="true" + fi + + cat >"$WORK_DIR/runner_config.json" </dev/null 2>&1 || true + cleanup_process cdc_kafka_consumer >/dev/null 2>&1 || true + cleanup_process cdc_storage_consumer >/dev/null 2>&1 || true + cleanup_process cdc_pulsar_consumer >/dev/null 2>&1 || true + stop_test $WORK_DIR +} + +trap 'cleanup' EXIT + +rm -rf $WORK_DIR && mkdir -p $WORK_DIR + +# start_tidb_cluster --workdir $WORK_DIR +cat >"$WORK_DIR/tidb_config.toml" </dev/null 2>&1; then + if rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 20 | rg -n . 
>/dev/null 2>&1; then + echo "log scan: panic/fatal/race detected" + rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 50 || true + exit 1 + fi +fi + +echo "[$(date)] <<<<<< run test case $TEST_NAME success! >>>>>>" diff --git a/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed.toml b/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed.toml new file mode 100644 index 0000000000..1ba6b2c025 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed.toml @@ -0,0 +1,5 @@ +[scheduler] +enable-table-across-nodes=false + +[sink.csv] +include-commit-ts = true diff --git a/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed_mysql.toml b/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed_mysql.toml new file mode 100644 index 0000000000..b1a3308066 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi_failover/conf/changefeed_mysql.toml @@ -0,0 +1,7 @@ +enable-sync-point = true +sync-point-interval = "30s" + +[scheduler] +enable-table-across-nodes=true +region-threshold=10 +region-count-per-span=10 diff --git a/tests/integration_tests/weekly_rand_multi_failover/conf/consumer.toml b/tests/integration_tests/weekly_rand_multi_failover/conf/consumer.toml new file mode 100644 index 0000000000..1ba6b2c025 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi_failover/conf/consumer.toml @@ -0,0 +1,5 @@ +[scheduler] +enable-table-across-nodes=false + +[sink.csv] +include-commit-ts = true diff --git a/tests/integration_tests/weekly_rand_multi_failover/run.sh b/tests/integration_tests/weekly_rand_multi_failover/run.sh new file mode 100644 index 0000000000..94800ba620 --- /dev/null +++ b/tests/integration_tests/weekly_rand_multi_failover/run.sh @@ -0,0 +1,210 @@ +#!/bin/bash + +# Random DDL+DML weekly smoke test 
(3 captures, random failover). +# +# Timeline (high level): +# 1) start_tidb_cluster: start upstream + downstream TiDB. +# 2) random_ddl_test_runner bootstrap: create identical schemas and deterministic seed data on both sides. +# 3) start TiCDC (3 captures) and create changefeed. +# 4) random_ddl_test_runner workload: +# - concurrent random DML + random DDL on upstream +# - random capture kill + restart (runner failover loop) +# 5) Final diff: sync_diff_inspector compares upstream vs downstream using diff_config.toml generated by the runner. +# 6) Post-check: scan logs for panic/fatal/data race patterns. +# +# Sequence diagram (simplified): +# run.sh +# |-> start_tidb_cluster +# |-> random_ddl_test_runner --phase bootstrap +# |-> run_cdc_server (capture=3) +# |-> cdc_cli_changefeed create +# |-> consumer (kafka/storage/pulsar) [optional] +# |-> random_ddl_test_runner --phase workload (includes failover) +# |-> check_sync_diff (sync_diff_inspector) + +set -eu + +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +source $CUR/../_utils/test_prepare + +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +SINK_TYPE=$1 +CHANGEFEED_ID="weeklyrand" + +RUN_SEED=${RUN_SEED:-1} +RUN_PROFILE=${RUN_PROFILE:-smoke} +RUN_DURATION=${RUN_DURATION:-3m} +RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT:-} +if [ -z "$RUN_CONVERGE_TIMEOUT" ]; then + if [ "$RUN_PROFILE" == "weekly" ]; then + RUN_CONVERGE_TIMEOUT="120m" + else + RUN_CONVERGE_TIMEOUT="30m" + fi +fi + +function build_runner() { + mkdir -p "$WORK_DIR" + go build -o "$WORK_DIR/random_ddl_test_runner" "$CUR/../../utils/random_ddl_test_runner" +} + +function write_runner_config() { + local mysql_sync_enabled="false" + if [ "$SINK_TYPE" == "mysql" ]; then + mysql_sync_enabled="true" + fi + + cat >"$WORK_DIR/runner_config.json" </dev/null 2>&1 || true + cleanup_process cdc_kafka_consumer >/dev/null 2>&1 || true + cleanup_process cdc_storage_consumer >/dev/null 2>&1 || true + cleanup_process cdc_pulsar_consumer >/dev/null 2>&1 || true 
+ stop_test $WORK_DIR +} + +trap 'cleanup' EXIT + +rm -rf $WORK_DIR && mkdir -p $WORK_DIR + +# start_tidb_cluster --workdir $WORK_DIR +cat >"$WORK_DIR/tidb_config.toml" </dev/null 2>&1; then + if rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 20 | rg -n . >/dev/null 2>&1; then + echo "log scan: panic/fatal/race detected" + rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 50 || true + exit 1 + fi +fi + +echo "[$(date)] <<<<<< run test case $TEST_NAME success! >>>>>>" diff --git a/tests/integration_tests/weekly_rand_single/conf/changefeed_mysql.toml b/tests/integration_tests/weekly_rand_single/conf/changefeed_mysql.toml new file mode 100644 index 0000000000..d01100dc36 --- /dev/null +++ b/tests/integration_tests/weekly_rand_single/conf/changefeed_mysql.toml @@ -0,0 +1,2 @@ +enable-sync-point = true +sync-point-interval = "30s" diff --git a/tests/integration_tests/weekly_rand_single/conf/consumer.toml b/tests/integration_tests/weekly_rand_single/conf/consumer.toml new file mode 100644 index 0000000000..28f76a83b9 --- /dev/null +++ b/tests/integration_tests/weekly_rand_single/conf/consumer.toml @@ -0,0 +1,2 @@ +[scheduler] +enable-table-across-nodes=true diff --git a/tests/integration_tests/weekly_rand_single/run.sh b/tests/integration_tests/weekly_rand_single/run.sh new file mode 100644 index 0000000000..873fa6077a --- /dev/null +++ b/tests/integration_tests/weekly_rand_single/run.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# Random DDL+DML weekly smoke test (single capture). +# +# Timeline (high level): +# 1) start_tidb_cluster: start upstream + downstream TiDB. 
+# 2) random_ddl_test_runner bootstrap: create identical schemas and deterministic seed data on both sides. +# 3) start TiCDC (single capture) and create changefeed (optionally start MQ/storage consumer). +# 4) random_ddl_test_runner workload: run concurrent random DML + random DDL against upstream only. +# 5) Final diff: sync_diff_inspector compares upstream vs downstream using diff_config.toml generated by the runner. +# 6) Post-check: scan logs for panic/fatal/data race patterns. +# +# Sequence diagram (simplified): +# run.sh +# |-> start_tidb_cluster +# |-> random_ddl_test_runner --phase bootstrap +# |-> run_cdc_server (capture=1) +# |-> cdc_cli_changefeed create +# |-> consumer (kafka/storage/pulsar) [optional] +# |-> random_ddl_test_runner --phase workload +# |-> check_sync_diff (sync_diff_inspector) + +set -eu + +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +source $CUR/../_utils/test_prepare + +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +SINK_TYPE=$1 +CHANGEFEED_ID="weeklyrand" + +RUN_SEED=${RUN_SEED:-1} +RUN_PROFILE=${RUN_PROFILE:-smoke} +RUN_DURATION=${RUN_DURATION:-3m} +RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT:-} +if [ -z "$RUN_CONVERGE_TIMEOUT" ]; then + if [ "$RUN_PROFILE" == "weekly" ]; then + RUN_CONVERGE_TIMEOUT="120m" + else + RUN_CONVERGE_TIMEOUT="30m" + fi +fi + +function build_runner() { + mkdir -p "$WORK_DIR" + go build -o "$WORK_DIR/random_ddl_test_runner" "$CUR/../../utils/random_ddl_test_runner" +} + +function write_runner_config() { + local mysql_sync_enabled="false" + if [ "$SINK_TYPE" == "mysql" ]; then + mysql_sync_enabled="true" + fi + + cat >"$WORK_DIR/runner_config.json" </dev/null 2>&1 || true + cleanup_process cdc_kafka_consumer >/dev/null 2>&1 || true + cleanup_process cdc_storage_consumer >/dev/null 2>&1 || true + cleanup_process cdc_pulsar_consumer >/dev/null 2>&1 || true + stop_test $WORK_DIR +} + +trap 'cleanup' EXIT + +rm -rf $WORK_DIR && mkdir -p $WORK_DIR + +# start_tidb_cluster --workdir $WORK_DIR +cat 
>"$WORK_DIR/tidb_config.toml" </dev/null 2>&1; then + if rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 20 | rg -n . >/dev/null 2>&1; then + echo "log scan: panic/fatal/race detected" + rg -n -i "panic|fatal|data race" "$WORK_DIR"/runner.log "$WORK_DIR"/ddl_trace.log "$WORK_DIR"/stdout*.log "$WORK_DIR"/cdc*.log "$WORK_DIR"/cdc_*_consumer*.log "$WORK_DIR"/cdc_*_consumer_stdout*.log 2>/dev/null | head -n 50 || true + exit 1 + fi +fi + +echo "[$(date)] <<<<<< run test case $TEST_NAME success! >>>>>>" diff --git a/tests/integration_tests/weekly_rand_slow_lossy_ddl/conf/changefeed_mysql.toml b/tests/integration_tests/weekly_rand_slow_lossy_ddl/conf/changefeed_mysql.toml new file mode 100644 index 0000000000..d01100dc36 --- /dev/null +++ b/tests/integration_tests/weekly_rand_slow_lossy_ddl/conf/changefeed_mysql.toml @@ -0,0 +1,2 @@ +enable-sync-point = true +sync-point-interval = "30s" diff --git a/tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh b/tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh new file mode 100644 index 0000000000..522e44cedc --- /dev/null +++ b/tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Random DDL+DML weekly smoke test: slow downstream apply for lossy DDL (MySQL sink only). +# +# Timeline (high level): +# 1) start_tidb_cluster: start upstream + downstream TiDB. +# 2) random_ddl_test_runner bootstrap: create identical schemas and deterministic seed data on both sides. +# 3) start TiCDC (single capture) with a failpoint that delays downstream DDL execution. +# 4) random_ddl_test_runner workload: run concurrent random DML + random DDL on upstream. +# 5) Final diff: sync_diff_inspector compares upstream vs downstream using diff_config.toml generated by the runner. 
+# +# Notes: +# - This case requires MySQL sink and sets GO_FAILPOINTS to delay MySQL sink DDL execution. + +set -eu + +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +source $CUR/../_utils/test_prepare + +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +SINK_TYPE=$1 +CHANGEFEED_ID="weeklyrand" + +RUN_SEED=${RUN_SEED:-1} +RUN_PROFILE=${RUN_PROFILE:-smoke} +RUN_DURATION=${RUN_DURATION:-3m} +RUN_CONVERGE_TIMEOUT=${RUN_CONVERGE_TIMEOUT:-} +if [ -z "$RUN_CONVERGE_TIMEOUT" ]; then + if [ "$RUN_PROFILE" == "weekly" ]; then + RUN_CONVERGE_TIMEOUT="120m" + else + RUN_CONVERGE_TIMEOUT="30m" + fi +fi +DDL_DELAY_SEC=${DDL_DELAY_SEC:-3} + +function build_runner() { + mkdir -p "$WORK_DIR" + go build -o "$WORK_DIR/random_ddl_test_runner" "$CUR/../../utils/random_ddl_test_runner" +} + +function write_runner_config() { + cat >"$WORK_DIR/runner_config.json" </dev/null 2>&1 || true + stop_test $WORK_DIR +} + +trap 'cleanup' EXIT + +if [ "$SINK_TYPE" != "mysql" ]; then + echo "Skip test since MySQL sink is required" + exit 0 +fi + +rm -rf $WORK_DIR && mkdir -p $WORK_DIR + +# start_tidb_cluster --workdir $WORK_DIR +cat >"$WORK_DIR/tidb_config.toml" <>>>>>" diff --git a/tests/utils/random_ddl_test_runner/autotune.go b/tests/utils/random_ddl_test_runner/autotune.go new file mode 100644 index 0000000000..8d0909393c --- /dev/null +++ b/tests/utils/random_ddl_test_runner/autotune.go @@ -0,0 +1,57 @@ +package main + +import "time" + +type autoTuneResult struct { + nextDML int32 + nextDDL int32 + fail bool +} + +func autoTuneStep( + sinceAdvance time.Duration, + successRate float64, + activeDML int32, + activeDDL int32, + maxDML int32, + maxDDL int32, + soft time.Duration, + hard time.Duration, +) autoTuneResult { + // autoTuneStep adjusts concurrency to keep replication progressing: + // - If checkpoint is stalled beyond "soft" or DML success rate collapses, scale down first. + // - If checkpoint is healthy, gradually scale up toward configured maxima. 
+ // + // It returns fail=true only when checkpoint is stalled beyond "hard". + if sinceAdvance >= hard { + return autoTuneResult{fail: true} + } + + nextDML := activeDML + nextDDL := activeDDL + + if sinceAdvance >= soft || successRate < 0.10 { + if nextDDL > 1 { + nextDDL-- + return autoTuneResult{nextDML: nextDML, nextDDL: nextDDL} + } + if nextDML > 1 { + nextDML -= 8 + if nextDML < 1 { + nextDML = 1 + } + } + return autoTuneResult{nextDML: nextDML, nextDDL: nextDDL} + } + + if nextDML < maxDML { + nextDML += 8 + if nextDML > maxDML { + nextDML = maxDML + } + } + if nextDDL < maxDDL && sinceAdvance < soft/2 { + nextDDL++ + } + return autoTuneResult{nextDML: nextDML, nextDDL: nextDDL} +} diff --git a/tests/utils/random_ddl_test_runner/autotune_test.go b/tests/utils/random_ddl_test_runner/autotune_test.go new file mode 100644 index 0000000000..67543b2821 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/autotune_test.go @@ -0,0 +1,52 @@ +package main + +import ( + "testing" + "time" +) + +func TestAutoTuneStep_DegradeOnHardStall(t *testing.T) { + res := autoTuneStep(10*time.Minute, 1.0, 64, 4, 128, 8, 2*time.Minute, 5*time.Minute) + if !res.fail { + t.Fatalf("expected fail on hard stall") + } +} + +func TestAutoTuneStep_DegradeDDLFirst(t *testing.T) { + res := autoTuneStep(3*time.Minute, 1.0, 64, 4, 128, 8, 2*time.Minute, 5*time.Minute) + if res.fail { + t.Fatalf("unexpected fail") + } + if res.nextDDL != 3 { + t.Fatalf("expected ddl decrease first, got %d", res.nextDDL) + } + if res.nextDML != 64 { + t.Fatalf("expected dml unchanged, got %d", res.nextDML) + } +} + +func TestAutoTuneStep_DegradeDMLWhenDDLMin(t *testing.T) { + res := autoTuneStep(3*time.Minute, 1.0, 16, 1, 128, 8, 2*time.Minute, 5*time.Minute) + if res.fail { + t.Fatalf("unexpected fail") + } + if res.nextDDL != 1 { + t.Fatalf("expected ddl unchanged at min, got %d", res.nextDDL) + } + if res.nextDML >= 16 { + t.Fatalf("expected dml decreased, got %d", res.nextDML) + } +} + +func 
TestAutoTuneStep_IncreaseWhenHealthy(t *testing.T) { + res := autoTuneStep(10*time.Second, 0.9, 16, 1, 32, 4, 2*time.Minute, 5*time.Minute) + if res.fail { + t.Fatalf("unexpected fail") + } + if res.nextDML <= 16 { + t.Fatalf("expected dml increased, got %d", res.nextDML) + } + if res.nextDDL != 2 { + t.Fatalf("expected ddl increased, got %d", res.nextDDL) + } +} diff --git a/tests/utils/random_ddl_test_runner/bootstrap.go b/tests/utils/random_ddl_test_runner/bootstrap.go new file mode 100644 index 0000000000..dd20b4128d --- /dev/null +++ b/tests/utils/random_ddl_test_runner/bootstrap.go @@ -0,0 +1,210 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" +) + +func (r *runner) bootstrap() error { + // bootstrap creates an identical starting point on upstream and downstream. + // + // Rationale: + // - The workload phase only writes to upstream. Downstream changes must come from TiCDC replication. + // - A deterministic baseline makes end-to-end diffs and triage reproducible (seeded by cfg.Seed). + ctx := context.Background() + r.logger.Printf("bootstrap start: workdir=%s profile=%s", r.cfg.Workdir, r.cfg.Profile) + + up, err := openMySQL(ctx, r.cfg.Upstream) + if err != nil { + return err + } + defer func() { _ = up.Close() }() + + down, err := openMySQL(ctx, r.cfg.Downstream) + if err != nil { + return err + } + defer func() { _ = down.Close() }() + + model := buildInitialModel(r.cfg) + + for _, dbName := range model.dbs { + if err := execBoth(ctx, up, down, + fmt.Sprintf("DROP DATABASE IF EXISTS `%s`", dbName), + fmt.Sprintf("CREATE DATABASE `%s`", dbName), + ); err != nil { + return err + } + } + + for _, tbl := range model.tables { + createSQL := tbl.schema.createTableSQL(tbl.db, tbl.name) + if err := execBoth(ctx, up, down, createSQL); err != nil { + return err + } + } + + // Deny region merge on split candidate tables to keep region pressure stable. 
+ for _, tbl := range model.splitTables { + attrsSQL := fmt.Sprintf("ALTER TABLE %s ATTRIBUTES 'merge_option=deny'", tbl.fqName()) + if err := execBoth(ctx, up, down, attrsSQL); err != nil { + return err + } + } + + baseRows := r.cfg.Bootstrap.BaseRowsPerTable + splitRows := r.cfg.Bootstrap.SplitRowsPerTable + + for _, tbl := range model.tables { + rows := baseRows + if tbl.domain == domainSplit { + rows += splitRows + } + if err := insertInitialRows(ctx, up, down, tbl, rows); err != nil { + return err + } + } + + r.logger.Printf("bootstrap done") + return nil +} + +func execBoth(ctx context.Context, up, down *sql.DB, stmts ...string) error { + for _, s := range stmts { + if _, err := up.ExecContext(ctx, s); err != nil { + return err + } + if _, err := down.ExecContext(ctx, s); err != nil { + return err + } + } + return nil +} + +func insertInitialRows(ctx context.Context, up, down *sql.DB, tbl *table, rows int) error { + tbl.mu.Lock() + schema := tbl.schema.clone() + tbl.mu.Unlock() + + // Use placeholders for values to keep SQL ASCII-only while allowing any binary/JSON payloads. + var cols []column + for _, c := range schema.columns { + if c.generated != "" { + continue + } + cols = append(cols, c) + } + + colNames := make([]string, 0, len(cols)) + for _, c := range cols { + colNames = append(colNames, c.name) + } + + const batchSize = 200 + for start := 1; start <= rows; start += batchSize { + end := start + batchSize - 1 + if end > rows { + end = rows + } + + var args []any + var valuesSQL strings.Builder + for i := start; i <= end; i++ { + if i > start { + valuesSQL.WriteString(",") + } + valuesSQL.WriteString("(") + for j := range cols { + if j > 0 { + valuesSQL.WriteString(",") + } + valuesSQL.WriteString("?") + } + valuesSQL.WriteString(")") + + rowArgs := buildDeterministicRowArgs(tbl, cols, int64(i)) + args = append(args, rowArgs...) 
+ } + + stmt := fmt.Sprintf("INSERT INTO %s (%s) VALUES %s", + tbl.fqName(), + backtickJoin(colNames), + valuesSQL.String(), + ) + if _, err := up.ExecContext(ctx, stmt, args...); err != nil { + return err + } + if _, err := down.ExecContext(ctx, stmt, args...); err != nil { + return err + } + } + return nil +} + +func backtickJoin(cols []string) string { + quoted := make([]string, 0, len(cols)) + for _, c := range cols { + quoted = append(quoted, fmt.Sprintf("`%s`", c)) + } + return strings.Join(quoted, ",") +} + +func buildDeterministicRowArgs(tbl *table, cols []column, rowID int64) []any { + // Deterministic values make bootstrap reproducible. Avoid non-ASCII in the SQL text by + // passing bytes/JSON via placeholders rather than embedding literals into the statement. + args := make([]any, 0, len(cols)) + for _, c := range cols { + switch strings.ToUpper(c.typ.base) { + case "BIGINT": + if c.name == "id" { + args = append(args, rowID) + } else { + args = append(args, deterministicInt64(rowID)) + } + case "INT": + if c.name == "a" { + args = append(args, int32(rowID)) + } else if c.name == "v" { + args = append(args, int32(rowID%1000)) + } else { + args = append(args, int32(deterministicInt64(rowID)%mathMaxInt32())) + } + case "VARCHAR": + if c.name == "pad" { + args = append(args, strings.Repeat("x", 256)) + } else { + args = append(args, asciiStringFromID(fmt.Sprintf("%s_%s", tbl.name, c.name), rowID)) + } + case "DATETIME": + args = append(args, deterministicTime(rowID)) + case "DECIMAL": + args = append(args, deterministicDecimal(rowID)) + case "JSON": + args = append(args, fmt.Sprintf("{\"id\":%d,\"table\":\"%s\"}", rowID, tbl.name)) + case "VARBINARY": + // Keep bytes deterministic; the SQL text remains ASCII-only due to placeholders. 
+ args = append(args, []byte(fmt.Sprintf("%064x", rowID))) + default: + args = append(args, nil) + } + } + return args +} + +func mathMaxInt32() int64 { + return int64(^uint32(0) >> 1) +} + +func sleepWithContext(ctx context.Context, d time.Duration) error { + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + return nil + } +} diff --git a/tests/utils/random_ddl_test_runner/config.go b/tests/utils/random_ddl_test_runner/config.go new file mode 100644 index 0000000000..ef061e2935 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/config.go @@ -0,0 +1,329 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "time" +) + +type duration struct { + time.Duration +} + +func (d *duration) UnmarshalJSON(b []byte) error { + var s string + if err := json.Unmarshal(b, &s); err != nil { + return err + } + parsed, err := time.ParseDuration(s) + if err != nil { + return err + } + d.Duration = parsed + return nil +} + +type mysqlConnConfig struct { + Host string `json:"host"` + Port int `json:"port"` + User string `json:"user"` + Password string `json:"password"` +} + +func (c mysqlConnConfig) dsn(database string) string { + // Keep DSN ASCII-only and deterministic. + return fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?charset=utf8mb4&parseTime=true&multiStatements=true&interpolateParams=true", + c.User, c.Password, c.Host, c.Port, database) +} + +type cdcAPIConfig struct { + Addr string `json:"addr"` + User string `json:"user"` + Password string `json:"password"` + Keyspace string `json:"keyspace"` + // ChangefeedID is required for the workload phase. 
+ ChangefeedID string `json:"changefeed_id"` +} + +type failoverConfig struct { + Enabled bool `json:"enabled"` + CaptureAddrs []string `json:"capture_addrs"` + CdcBinary string `json:"cdc_binary"` + MinInterval duration `json:"min_interval"` + MaxInterval duration `json:"max_interval"` + GatedProbability float64 `json:"gated_probability"` +} + +type dmlConfig struct { + MaxWorkers int `json:"max_workers"` + InitialWorkers int `json:"initial_workers"` + HotspotRatio float64 `json:"hotspot_ratio"` + HotTableRatio float64 `json:"hot_table_ratio"` + BigTxnEnabled bool `json:"big_txn_enabled"` + BigTxnInterval duration `json:"big_txn_interval"` + BigTxnRowsMin int `json:"big_txn_rows_min"` + BigTxnRowsMax int `json:"big_txn_rows_max"` + KeyConflictEnabled bool `json:"key_conflict_enabled"` + KeyConflictKeyspace int `json:"key_conflict_keyspace"` +} + +type ddlConfig struct { + MaxWorkers int `json:"max_workers"` + InitialWorkers int `json:"initial_workers"` +} + +type verifyConfig struct { + HealthInterval duration `json:"health_interval"` + NoAdvanceSoft duration `json:"no_advance_soft"` + NoAdvanceHard duration `json:"no_advance_hard"` + LogScanEnabled bool `json:"log_scan_enabled"` + PanicPatterns []string `json:"panic_patterns"` + FailOnPanicMatch bool `json:"fail_on_panic_match"` + ConvergeWait duration `json:"converge_wait"` + ConvergeTimeout duration `json:"converge_timeout"` +} + +type mysqlSyncpointConfig struct { + Enabled bool `json:"enabled"` + DiffInterval duration `json:"diff_interval"` + MaxDiffChecks int `json:"max_diff_checks"` + UpstreamStatusHost string `json:"upstream_status_host"` + UpstreamStatusPort int `json:"upstream_status_port"` +} + +type config struct { + Workdir string `json:"workdir"` + Profile string `json:"profile"` + Seed int64 `json:"seed"` + Duration duration `json:"duration"` + + Upstream mysqlConnConfig `json:"upstream"` + Downstream mysqlConnConfig `json:"downstream"` + + CDC cdcAPIConfig `json:"cdc"` + SinkType string 
`json:"sink_type"` + Failover failoverConfig `json:"failover"` + DML dmlConfig `json:"dml"` + DDL ddlConfig `json:"ddl"` + Verify verifyConfig `json:"verify"` + MySQL mysqlSyncpointConfig `json:"mysql"` + + Bootstrap struct { + DBCount int `json:"db_count"` + TablesPerDB int `json:"tables_per_db"` + BaseRowsPerTable int `json:"base_rows_per_table"` + SplitRowsPerTable int `json:"split_rows_per_table"` + FrozenRowsPerTable int `json:"frozen_rows_per_table"` + } `json:"bootstrap"` +} + +func loadConfig(path string) (*config, error) { + b, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var cfg config + if err := json.Unmarshal(b, &cfg); err != nil { + return nil, err + } + if err := cfg.applyDefaultsAndValidate(); err != nil { + return nil, err + } + return &cfg, nil +} + +func (c *config) applyDefaultsAndValidate() error { + // Defaults should keep the smoke profile fast while still exercising key paths. + // "weekly" uses larger concurrency / data volumes to increase coverage. + if c.Workdir == "" { + return fmt.Errorf("workdir is required") + } + if c.Seed == 0 { + // Allow explicit 0 but require deterministic default. 
+ c.Seed = 1 + } + if c.Duration.Duration <= 0 { + c.Duration.Duration = 3 * time.Minute + } + if c.Profile == "" { + c.Profile = "smoke" + } + + if c.Upstream.Host == "" { + c.Upstream.Host = "127.0.0.1" + } + if c.Upstream.Port == 0 { + c.Upstream.Port = 4000 + } + if c.Upstream.User == "" { + c.Upstream.User = "root" + } + + if c.Downstream.Host == "" { + c.Downstream.Host = "127.0.0.1" + } + if c.Downstream.Port == 0 { + c.Downstream.Port = 3306 + } + if c.Downstream.User == "" { + c.Downstream.User = "root" + } + + if c.Bootstrap.DBCount == 0 { + c.Bootstrap.DBCount = 5 + } + if c.Bootstrap.TablesPerDB == 0 { + c.Bootstrap.TablesPerDB = 20 + } + if c.Bootstrap.BaseRowsPerTable == 0 { + if c.Profile == "weekly" { + c.Bootstrap.BaseRowsPerTable = 1000 + } else { + c.Bootstrap.BaseRowsPerTable = 100 + } + } + if c.Bootstrap.SplitRowsPerTable == 0 { + if c.Profile == "weekly" { + c.Bootstrap.SplitRowsPerTable = 5000 + } else { + c.Bootstrap.SplitRowsPerTable = 500 + } + } + if c.Bootstrap.FrozenRowsPerTable == 0 { + c.Bootstrap.FrozenRowsPerTable = 50 + } + + if c.DML.MaxWorkers == 0 { + if c.Profile == "weekly" { + c.DML.MaxWorkers = 128 + } else { + c.DML.MaxWorkers = 32 + } + } + if c.DML.InitialWorkers == 0 { + c.DML.InitialWorkers = c.DML.MaxWorkers / 2 + if c.DML.InitialWorkers < 1 { + c.DML.InitialWorkers = 1 + } + } + if c.DML.HotspotRatio == 0 { + c.DML.HotspotRatio = 0.8 + } + if c.DML.HotTableRatio == 0 { + c.DML.HotTableRatio = 0.1 + } + if c.DML.BigTxnInterval.Duration == 0 { + c.DML.BigTxnInterval.Duration = 20 * time.Second + } + // Keep these enabled by default to ensure the workload includes the key motifs from the design doc. 
+ c.DML.BigTxnEnabled = true + c.DML.KeyConflictEnabled = true + if c.DML.BigTxnRowsMin == 0 { + c.DML.BigTxnRowsMin = 200 + } + if c.DML.BigTxnRowsMax == 0 { + c.DML.BigTxnRowsMax = 400 + } + if c.DML.KeyConflictKeyspace == 0 { + c.DML.KeyConflictKeyspace = 1024 + } + + if c.DDL.MaxWorkers == 0 { + if c.Profile == "weekly" { + c.DDL.MaxWorkers = 8 + } else { + c.DDL.MaxWorkers = 2 + } + } + if c.DDL.InitialWorkers == 0 { + c.DDL.InitialWorkers = c.DDL.MaxWorkers / 2 + if c.DDL.InitialWorkers < 1 { + c.DDL.InitialWorkers = 1 + } + } + + if c.Verify.HealthInterval.Duration == 0 { + c.Verify.HealthInterval.Duration = 10 * time.Second + } + if c.Verify.NoAdvanceSoft.Duration == 0 { + c.Verify.NoAdvanceSoft.Duration = 2 * time.Minute + } + if c.Verify.NoAdvanceHard.Duration == 0 { + if c.Profile == "weekly" { + c.Verify.NoAdvanceHard.Duration = 15 * time.Minute + } else { + c.Verify.NoAdvanceHard.Duration = 5 * time.Minute + } + } + if c.Verify.ConvergeWait.Duration == 0 { + c.Verify.ConvergeWait.Duration = 20 * time.Second + } + if c.Verify.ConvergeTimeout.Duration == 0 { + c.Verify.ConvergeTimeout.Duration = c.Verify.NoAdvanceHard.Duration * 2 + if c.Verify.ConvergeTimeout.Duration < 2*time.Minute { + c.Verify.ConvergeTimeout.Duration = 2 * time.Minute + } + } + if len(c.Verify.PanicPatterns) == 0 { + c.Verify.PanicPatterns = []string{"panic", "fatal", "DATA RACE"} + } + if !c.Verify.LogScanEnabled { + c.Verify.LogScanEnabled = true + } + if !c.Verify.FailOnPanicMatch { + c.Verify.FailOnPanicMatch = true + } + + if c.Failover.MinInterval.Duration == 0 { + c.Failover.MinInterval.Duration = 20 * time.Second + } + if c.Failover.MaxInterval.Duration == 0 { + c.Failover.MaxInterval.Duration = 40 * time.Second + } + if c.Failover.GatedProbability == 0 { + c.Failover.GatedProbability = 0.5 + } + if c.Failover.CdcBinary == "" { + c.Failover.CdcBinary = "cdc.test" + } + if c.CDC.Keyspace == "" { + c.CDC.Keyspace = "default" + } + if c.CDC.User == "" { + c.CDC.User = "ticdc" + 
} + if c.CDC.Password == "" { + c.CDC.Password = "ticdc_secret" + } + if c.CDC.Addr == "" { + c.CDC.Addr = "127.0.0.1:8300" + } + + if c.MySQL.DiffInterval.Duration == 0 { + c.MySQL.DiffInterval.Duration = 2 * time.Minute + } + if c.MySQL.MaxDiffChecks == 0 { + c.MySQL.MaxDiffChecks = 1 + } + if c.MySQL.UpstreamStatusHost == "" { + c.MySQL.UpstreamStatusHost = "127.0.0.1" + } + if c.MySQL.UpstreamStatusPort == 0 { + c.MySQL.UpstreamStatusPort = 10080 + } + + // Basic validation. + // Keep the schema shape stable so that: + // - bootstrap can create a predictable workload surface, + // - the integration scripts can pre-create storage sink directories deterministically, + // - the model can assume fixed database/table sets. + if c.Bootstrap.DBCount != 5 { + return fmt.Errorf("db_count must be 5 to match the design doc, got %d", c.Bootstrap.DBCount) + } + if c.Bootstrap.TablesPerDB != 20 { + return fmt.Errorf("tables_per_db must be 20 to match the design doc, got %d", c.Bootstrap.TablesPerDB) + } + + return nil +} diff --git a/tests/utils/random_ddl_test_runner/db.go b/tests/utils/random_ddl_test_runner/db.go new file mode 100644 index 0000000000..f89288786b --- /dev/null +++ b/tests/utils/random_ddl_test_runner/db.go @@ -0,0 +1,39 @@ +package main + +import ( + "context" + "database/sql" + "time" + + _ "github.com/go-sql-driver/mysql" +) + +func openMySQL(ctx context.Context, cfg mysqlConnConfig) (*sql.DB, error) { + return openMySQLWithExtraParams(ctx, cfg, "") +} + +func openMySQLWithExtraParams(ctx context.Context, cfg mysqlConnConfig, extraParams string) (*sql.DB, error) { + dsn := cfg.dsn("") + if extraParams != "" { + dsn += "&" + extraParams + } + return openMySQLWithDSN(ctx, dsn) +} + +func openMySQLWithDSN(ctx context.Context, dsn string) (*sql.DB, error) { + db, err := sql.Open("mysql", dsn) + if err != nil { + return nil, err + } + db.SetMaxOpenConns(128) + db.SetMaxIdleConns(128) + db.SetConnMaxLifetime(5 * time.Minute) + + pingCtx, cancel := 
context.WithTimeout(ctx, 10*time.Second) + defer cancel() + if err := db.PingContext(pingCtx); err != nil { + _ = db.Close() + return nil, err + } + return db, nil +} diff --git a/tests/utils/random_ddl_test_runner/ddl.go b/tests/utils/random_ddl_test_runner/ddl.go new file mode 100644 index 0000000000..a0347cacde --- /dev/null +++ b/tests/utils/random_ddl_test_runner/ddl.go @@ -0,0 +1,476 @@ +package main + +import ( + "fmt" + "math/rand" + "strings" +) + +type ddlKind struct { + // name is used for logging and selector tracking. + name string + domain domain + baseWeight float64 + // lossy marks DDLs that can drop data or schema information (e.g., DROP/TRUNCATE/DROP COLUMN). + // These are still useful for coverage, but are typically constrained to churn domain. + lossy bool + + // gen returns: + // - sql: the DDL statement to execute on upstream. + // - apply: a callback that mutates the in-memory model when and only when the DDL succeeds. + gen func(rng *rand.Rand, t *table) (sql string, apply func()) +} + +func defaultDDLKinds() []ddlKind { + // DDL kinds are grouped by domain: + // - stable: schema changes that are relatively friendly to snapshot-based diffing. + // - churn: destructive or fragile DDLs that can invalidate snapshot reads and diff configs. + // - split_candidate: a subset used to stress split/region pressure with larger tables. + return []ddlKind{ + { + name: "add_column", + domain: domainStable, + baseWeight: 5, + gen: genAddColumn, + }, + { + name: "add_index", + domain: domainStable, + baseWeight: 4, + gen: genAddIndex, + }, + { + name: "convert_charset", + domain: domainStable, + baseWeight: 1, + gen: genConvertCharset, + }, + { + name: "add_partition", + domain: domainStable, + baseWeight: 1, + gen: genAddPartition, + }, + + // Note: Periodic MySQL syncpoint diffs rely on snapshot reads via sync_diff_inspector. 
+ // Some schema-changing DDLs (e.g., DROP COLUMN / MODIFY COLUMN / DROP INDEX) are + // known to be fragile for snapshot-based diffing. Keep them in churn domain to + // preserve periodic diff stability, while still exercising these DDLs in the workload. + { + name: "drop_column", + domain: domainChurn, + baseWeight: 2, + lossy: true, + gen: genDropColumn, + }, + { + name: "modify_column_type", + domain: domainChurn, + baseWeight: 1, + lossy: true, + gen: genModifyColumnType, + }, + { + name: "drop_index", + domain: domainChurn, + baseWeight: 1, + gen: genDropIndex, + }, + + // Split candidate tables: use the same DDL set as stable, but target split domain. + { + name: "split_add_index", + domain: domainSplit, + baseWeight: 2, + gen: genAddIndex, + }, + { + name: "split_drop_index", + domain: domainSplit, + baseWeight: 1, + gen: genDropIndex, + }, + { + name: "split_add_partition", + domain: domainSplit, + baseWeight: 1, + gen: genAddPartition, + }, + + // Churn domain: destructive operations. + { + name: "truncate_table", + domain: domainChurn, + baseWeight: 2, + lossy: true, + gen: genTruncateTable, + }, + { + name: "drop_table", + domain: domainChurn, + baseWeight: 1, + lossy: true, + gen: genDropTable, + }, + // Do not include RECOVER TABLE in the default CDC random DDL set. + // It restores table data and schema from TiDB's local DDL history / GC + // snapshot state, so executing the same text on the downstream can recover + // a different historical table when names are reused. The recovered rows are + // not emitted as new DML either, making the operation unsafe for this suite. 
+ { + name: "drop_and_recreate_table", + domain: domainChurn, + baseWeight: 1, + lossy: true, + gen: genDropAndRecreateTable, + }, + { + name: "rename_table", + domain: domainChurn, + baseWeight: 1, + gen: genRenameTable, + }, + } +} + +func genAddColumn(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + if len(t.schema.columns) > 32 { + return "", nil + } + newName := fmt.Sprintf("c_%d", rng.Intn(10_000_000)) + for _, c := range t.schema.columns { + if c.name == newName { + return "", nil + } + } + typ := randDDLColType(rng) + def := "0" + if strings.EqualFold(typ.base, "VARCHAR") { + def = "''" + } + sql := fmt.Sprintf("ALTER TABLE %s ADD COLUMN `%s` %s NOT NULL DEFAULT %s", + t.fqName(), newName, typ.sql(), def) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.schema.columns = append(t.schema.columns, column{ + name: newName, + typ: typ, + nullable: false, + defaultSQL: def, + }) + } + return sql, apply +} + +func genDropColumn(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + var candidates []int + for i, c := range t.schema.columns { + if c.generated != "" { + continue + } + if c.name == "id" { + continue + } + if containsString(t.schema.primaryKey, c.name) { + continue + } + candidates = append(candidates, i) + } + if len(candidates) == 0 { + return "", nil + } + idx := candidates[rng.Intn(len(candidates))] + colName := t.schema.columns[idx].name + sql := fmt.Sprintf("ALTER TABLE %s DROP COLUMN `%s`", t.fqName(), colName) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + if idx >= len(t.schema.columns) || t.schema.columns[idx].name != colName { + // Best-effort: column order may have changed due to concurrent successful DDL. 
+ for i, c := range t.schema.columns { + if c.name == colName { + idx = i + break + } + } + } + if idx >= len(t.schema.columns) || t.schema.columns[idx].name != colName { + return + } + t.schema.columns = append(t.schema.columns[:idx], t.schema.columns[idx+1:]...) + // Drop indexes referencing the column. + var newIdx []index + for _, ix := range t.schema.indexes { + if containsString(ix.columns, colName) { + continue + } + newIdx = append(newIdx, ix) + } + t.schema.indexes = newIdx + } + return sql, apply +} + +func genModifyColumnType(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + var candidates []int + for i, c := range t.schema.columns { + if c.generated != "" { + continue + } + if containsString(t.schema.primaryKey, c.name) { + continue + } + if strings.EqualFold(c.typ.base, "JSON") || strings.EqualFold(c.typ.base, "DATETIME") { + continue + } + candidates = append(candidates, i) + } + if len(candidates) == 0 { + return "", nil + } + idx := candidates[rng.Intn(len(candidates))] + colName := t.schema.columns[idx].name + newTyp := randDDLColType(rng) + sql := fmt.Sprintf("ALTER TABLE %s MODIFY COLUMN `%s` %s NOT NULL", + t.fqName(), colName, newTyp.sql()) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + for i := range t.schema.columns { + if t.schema.columns[i].name == colName { + t.schema.columns[i].typ = newTyp + return + } + } + } + return sql, apply +} + +func genAddIndex(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + var cols []string + for _, c := range t.schema.columns { + if c.generated != "" { + continue + } + if strings.EqualFold(c.typ.base, "JSON") { + continue + } + cols = append(cols, c.name) + } + if len(cols) == 0 { + return "", nil + } + col := cols[rng.Intn(len(cols))] + name := fmt.Sprintf("idx_%s_%d", col, rng.Intn(10_000)) + for _, ix := range t.schema.indexes { + if ix.name == name { + 
return "", nil + } + } + sql := fmt.Sprintf("CREATE INDEX `%s` ON %s (`%s`)", name, t.fqName(), col) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.schema.indexes = append(t.schema.indexes, index{name: name, columns: []string{col}, unique: false}) + } + return sql, apply +} + +func genDropIndex(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + if len(t.schema.indexes) == 0 { + return "", nil + } + ix := t.schema.indexes[rng.Intn(len(t.schema.indexes))] + sql := fmt.Sprintf("DROP INDEX `%s` ON %s", ix.name, t.fqName()) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + var newIdx []index + for _, x := range t.schema.indexes { + if x.name == ix.name { + continue + } + newIdx = append(newIdx, x) + } + t.schema.indexes = newIdx + } + return sql, apply +} + +func genConvertCharset(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + targetCharset := "utf8mb4" + targetCollate := "utf8mb4_bin" + if !strings.EqualFold(t.schema.charset, "gbk") && rng.Intn(2) == 0 { + targetCharset = "gbk" + targetCollate = "gbk_bin" + } + sql := fmt.Sprintf("ALTER TABLE %s CONVERT TO CHARACTER SET %s COLLATE %s", t.fqName(), targetCharset, targetCollate) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.schema.charset = targetCharset + t.schema.collation = targetCollate + } + return sql, apply +} + +func genAddPartition(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + if !strings.Contains(strings.ToUpper(t.schema.partitionSQL), "RANGE") { + return "", nil + } + if t.rangePartitionNextID <= 0 || t.rangePartitionNextBound <= 0 { + return "", nil + } + pid := t.rangePartitionNextID + nextBound := t.rangePartitionNextBound + 1_000_000_000_000 + name := fmt.Sprintf("p%d", pid) + sql := fmt.Sprintf("ALTER TABLE %s ADD PARTITION (PARTITION %s VALUES LESS THAN 
(%d))", + t.fqName(), name, nextBound) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.rangePartitionNextID++ + t.rangePartitionNextBound = nextBound + } + return sql, apply +} + +func genTruncateTable(_ *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + sql := fmt.Sprintf("TRUNCATE TABLE %s", t.fqName()) + return sql, func() {} +} + +func genDropTable(_ *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + sql := fmt.Sprintf("DROP TABLE IF EXISTS %s", t.fqName()) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.exists = false + } + return sql, apply +} + +func genRecoverTable(_ *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if t.exists { + return "", nil + } + sql := fmt.Sprintf("RECOVER TABLE %s", t.fqName()) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.exists = true + } + return sql, apply +} + +func genDropAndRecreateTable(_ *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + sql := fmt.Sprintf("DROP TABLE IF EXISTS %s; %s", + t.fqName(), + t.initialSchema.createTableSQL(t.db, t.name), + ) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.exists = true + t.schema = t.initialSchema.clone() + } + return sql, apply +} + +func genRenameTable(rng *rand.Rand, t *table) (string, func()) { + t.mu.Lock() + defer t.mu.Unlock() + if !t.exists { + return "", nil + } + + // Keep the rename logic simple: rename each churn table at most once. + // This avoids creating long chains of renamed tables and reduces the chance of + // duplicate rename DDLs under failover. + if strings.Contains(t.name, "_r_") { + return "", nil + } + + newName := fmt.Sprintf("%s_r_%d", t.name, rng.Intn(10_000_000)) + // The max length of a TiDB table name is 64. Keep a small margin. 
+ if len(newName) > 60 { + return "", nil + } + sql := fmt.Sprintf("RENAME TABLE %s TO `%s`.`%s`", t.fqName(), t.db, newName) + apply := func() { + t.mu.Lock() + defer t.mu.Unlock() + t.name = newName + } + return sql, apply +} + +func randDDLColType(rng *rand.Rand) colType { + switch rng.Intn(3) { + case 0: + return colType{base: "INT"} + case 1: + return colType{base: "BIGINT"} + default: + return colType{base: "VARCHAR", varcharN: 64} + } +} + +func containsString(ss []string, s string) bool { + for _, x := range ss { + if x == s { + return true + } + } + return false +} diff --git a/tests/utils/random_ddl_test_runner/ddl_test.go b/tests/utils/random_ddl_test_runner/ddl_test.go new file mode 100644 index 0000000000..3872d7bd9a --- /dev/null +++ b/tests/utils/random_ddl_test_runner/ddl_test.go @@ -0,0 +1,60 @@ +package main + +import ( + "math/rand" + "strings" + "testing" +) + +func TestGenDropColumn_DoesNotDropPrimaryKey(t *testing.T) { + tbl := &table{ + db: "db1", + name: "t00", + schema: tableSchema{ + columns: []column{ + {name: "id", typ: colType{base: "BIGINT"}, nullable: false}, + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "VARCHAR", varcharN: 64}, nullable: false}, + }, + primaryKey: []string{"id"}, + }, + exists: true, + } + rng := rand.New(rand.NewSource(1)) + sqlText, _ := genDropColumn(rng, tbl) + if sqlText == "" { + t.Fatalf("expected a ddl statement") + } + if strings.Contains(sqlText, "`id`") { + t.Fatalf("expected not to drop pk column, sql=%s", sqlText) + } +} + +func TestGenAddPartition_RequiresRangePartition(t *testing.T) { + tbl := &table{ + db: "db1", + name: "t07", + schema: tableSchema{ + columns: []column{{name: "id", typ: colType{base: "BIGINT"}, nullable: false}}, + }, + exists: true, + } + rng := rand.New(rand.NewSource(1)) + sqlText, _ := genAddPartition(rng, tbl) + if sqlText != "" { + t.Fatalf("expected empty ddl for non-partitioned table, got %s", sqlText) + } +} + +func 
TestDefaultDDLKindsExcludeRecoverTable(t *testing.T) { + for _, kind := range defaultDDLKinds() { + if kind.name == "recover_table" { + t.Fatalf("recover_table should not be enabled by default") + } + } + + sqlText, apply := genRecoverTable(rand.New(rand.NewSource(1)), &table{db: "db1", name: "t1"}) + if sqlText == "" || apply == nil { + t.Fatalf("recover_table generator should remain available for explicit tests") + } +} diff --git a/tests/utils/random_ddl_test_runner/ddl_worker.go b/tests/utils/random_ddl_test_runner/ddl_worker.go new file mode 100644 index 0000000000..24bdcbeb0d --- /dev/null +++ b/tests/utils/random_ddl_test_runner/ddl_worker.go @@ -0,0 +1,165 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "log" + "math/rand" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" +) + +type ddlTrace struct { + mu sync.Mutex + file *os.File + log *log.Logger +} + +func newDDLTrace(workdir string) (*ddlTrace, error) { + if err := os.MkdirAll(workdir, 0o755); err != nil { + return nil, err + } + f, err := os.OpenFile(filepath.Join(workdir, "ddl_trace.log"), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return nil, err + } + return &ddlTrace{ + file: f, + log: log.New(f, "", log.LstdFlags|log.Lmicroseconds|log.LUTC), + }, nil +} + +func (t *ddlTrace) close() { + if t == nil || t.file == nil { + return + } + _ = t.file.Close() +} + +func (t *ddlTrace) record(kind string, target string, sql string, err error) { + if t == nil { + return + } + t.mu.Lock() + defer t.mu.Unlock() + status := "ok" + msg := "" + if err != nil { + status = "err" + msg = err.Error() + } + // Avoid printing raw SQL for DML here; DDL is expected ASCII-only. 
+ t.log.Printf("kind=%s target=%s status=%s sql=%q err=%q", kind, target, status, sql, msg) +} + +func ddlWorker( + ctx context.Context, + db *sql.DB, + model *clusterModel, + seed int64, + workerID int, + activeWorkers *int32, + selector *ddlSelector, + trace *ddlTrace, + logger *log.Logger, +) { + // ddlWorker is a best-effort DDL submitter. + // + // Concurrency control: + // - Spawn MaxWorkers goroutines, but only workers with workerID < activeWorkers are "active". + // - healthAndAutotuneLoop adjusts activeWorkers based on checkpoint liveness. + // + // Correctness model: + // - Each DDL kind returns (sql, apply). apply updates the in-memory model and is invoked only + // when the DDL succeeds, so subsequent DML/DDL generation can track schema evolution. + // - DDL failures are expected under concurrency (e.g., conflicts, missing tables) and do not + // stop the worker. + rng := rand.New(rand.NewSource(seed + int64(workerID))) + + for { + select { + case <-ctx.Done(): + return + default: + } + + if int32(workerID) >= atomic.LoadInt32(activeWorkers) { + _ = sleepWithContext(ctx, 500*time.Millisecond) + continue + } + + kind := selector.pick(rng) + var tbl *table + if kind.name == "recover_table" { + tbl = pickMissingTable(rng, model.churnTables) + } else { + tbl = model.pickTableForDomain(rng, kind.domain) + } + if tbl == nil || tbl.isMotif { + _ = sleepWithContext(ctx, 100*time.Millisecond) + continue + } + + sqlText, apply := kind.gen(rng, tbl) + if sqlText == "" || apply == nil { + _ = sleepWithContext(ctx, 100*time.Millisecond) + continue + } + + start := time.Now() + _, err := db.ExecContext(ctx, sqlText) + if err == nil { + apply() + selector.record(kind.name) + } + if logger != nil { + logger.Printf("ddl worker=%d kind=%s target=%s elapsed=%s err=%v", + workerID, kind.name, tbl.fqName(), time.Since(start), err) + } + if trace != nil { + trace.record(kind.name, tbl.fqName(), sqlText, err) + } + + // Keep DDL submission rate bounded. 
+ _ = sleepWithContext(ctx, time.Second) + } +} + +func pickMissingTable(rng *rand.Rand, candidates []*table) *table { + if len(candidates) == 0 { + return nil + } + for i := 0; i < 10; i++ { + t := candidates[rng.Intn(len(candidates))] + t.mu.Lock() + exists := t.exists + t.mu.Unlock() + if !exists { + return t + } + } + return nil +} + +func ensureFileClosed(logger *log.Logger, t *ddlTrace) { + if t == nil { + return + } + t.close() + if logger != nil { + logger.Printf("ddl trace file closed") + } +} + +func writeFileAtomic(path string, data []byte) error { + dir := filepath.Dir(path) + tmp := filepath.Join(dir, fmt.Sprintf(".tmp-%d", time.Now().UnixNano())) + if err := os.WriteFile(tmp, data, 0o644); err != nil { + return err + } + return os.Rename(tmp, path) +} diff --git a/tests/utils/random_ddl_test_runner/dml.go b/tests/utils/random_ddl_test_runner/dml.go new file mode 100644 index 0000000000..8454b8c11d --- /dev/null +++ b/tests/utils/random_ddl_test_runner/dml.go @@ -0,0 +1,381 @@ +package main + +import ( + "context" + "database/sql" + "errors" + "fmt" + "math/rand" + "strings" + "sync/atomic" + "time" + + "github.com/go-sql-driver/mysql" +) + +type dmlCounters struct { + total uint64 + success uint64 + unknownTable uint64 + unknownColumn uint64 + duplicateEntry uint64 + otherError uint64 +} + +func (c *dmlCounters) record(err error) { + // DML errors are expected under concurrent DDL and are not fatal by themselves. + // Counters are used by the health loop to infer a success rate for auto-tuning. 
+ atomic.AddUint64(&c.total, 1) + if err == nil { + atomic.AddUint64(&c.success, 1) + return + } + var me *mysql.MySQLError + if errors.As(err, &me) { + switch me.Number { + case 1146: + atomic.AddUint64(&c.unknownTable, 1) + return + case 1054: + atomic.AddUint64(&c.unknownColumn, 1) + return + case 1062: + atomic.AddUint64(&c.duplicateEntry, 1) + return + } + } + atomic.AddUint64(&c.otherError, 1) +} + +type dmlSnapshot struct { + Total uint64 `json:"total"` + Success uint64 `json:"success"` + UnknownTable uint64 `json:"unknown_table"` + UnknownColumn uint64 `json:"unknown_column"` + DuplicateEntry uint64 `json:"duplicate_entry"` + OtherError uint64 `json:"other_error"` +} + +func (c *dmlCounters) snapshot() dmlSnapshot { + return dmlSnapshot{ + Total: atomic.LoadUint64(&c.total), + Success: atomic.LoadUint64(&c.success), + UnknownTable: atomic.LoadUint64(&c.unknownTable), + UnknownColumn: atomic.LoadUint64(&c.unknownColumn), + DuplicateEntry: atomic.LoadUint64(&c.duplicateEntry), + OtherError: atomic.LoadUint64(&c.otherError), + } +} + +func dmlWorker( + ctx context.Context, + db *sql.DB, + model *clusterModel, + seed int64, + workerID int, + activeWorkers *int32, + cfg dmlConfig, + counters *dmlCounters, + motifStep *int32, +) { + // dmlWorker generates best-effort DML against upstream. + // + // Concurrency control: + // - Spawn MaxWorkers goroutines, but only workerID < activeWorkers are "active". + // - healthAndAutotuneLoop adjusts activeWorkers based on checkpoint liveness. + // + // Schema handling: + // - DML generation reads the table schema under lock and then executes outside the lock. + // - DDL may race with DML, so unknown table/column errors are tracked and tolerated. 
+ rng := rand.New(rand.NewSource(seed + int64(workerID))) + + for { + select { + case <-ctx.Done(): + return + default: + } + + if int32(workerID) >= atomic.LoadInt32(activeWorkers) { + _ = sleepWithContext(ctx, 200*time.Millisecond) + continue + } + + tbl := model.pickTableForDML(rng, cfg.HotspotRatio) + var ( + stmt string + args []any + err error + ) + + if tbl.isMotif { + stmt, args, err = buildMotifDML(rng, tbl, atomic.LoadInt32(motifStep)) + } else { + stmt, args, err = buildGenericDML(rng, tbl) + } + if err != nil { + // Internal generation error; keep the worker alive. + counters.record(err) + _ = sleepWithContext(ctx, 50*time.Millisecond) + continue + } + if stmt == "" { + _ = sleepWithContext(ctx, 20*time.Millisecond) + continue + } + + _, execErr := db.ExecContext(ctx, stmt, args...) + counters.record(execErr) + } +} + +func buildGenericDML(rng *rand.Rand, tbl *table) (string, []any, error) { + tbl.mu.Lock() + defer tbl.mu.Unlock() + if !tbl.exists { + return "", nil, nil + } + + // Keep the overall mix stable: + // - INSERT is the dominant operation + // - UPDATE/DELETE provide mutation pressure + // + // For tables without a single-column primary key (keyless or composite PK), UPDATE/DELETE + // fall back to bounded operations using LIMIT 1 to avoid requiring key materialization. + switch rng.Intn(10) { + case 0, 1: + stmt, args, err := buildDeleteLocked(rng, tbl) + if stmt != "" || err != nil { + return stmt, args, err + } + case 2, 3: + stmt, args, err := buildUpdateLocked(rng, tbl) + if stmt != "" || err != nil { + return stmt, args, err + } + default: + // Fall through to INSERT. + } + return buildInsertLocked(rng, tbl, nil) +} + +func buildMotifDML(rng *rand.Rand, tbl *table, step int32) (string, []any, error) { + // Motif DML intentionally omits some columns to exercise default value drift + // and schema evolution patterns coordinated by motif.go. 
+ tbl.mu.Lock() + defer tbl.mu.Unlock() + if !tbl.exists { + return "", nil, nil + } + + omit := map[string]struct{}{} + for _, c := range tbl.schema.columns { + if c.name == "site_code" { + // Always omit site_code to exercise default drift. + omit["site_code"] = struct{}{} + break + } + } + + // Before PK is added, focus on inserts to create rows before/after default drift. + if step < 3 { + return buildInsertLocked(rng, tbl, omit) + } + + // After PK is added (a, site_code), only update non-frozen rows inserted after default is unified. + if tbl.motifUnifiedStart > 0 && tbl.nextID > tbl.motifUnifiedStart && rng.Intn(4) == 0 { + return buildMotifUpdateAfterUnifiedLocked(rng, tbl) + } + return buildInsertLocked(rng, tbl, omit) +} + +func buildInsertLocked(rng *rand.Rand, tbl *table, omitCols map[string]struct{}) (string, []any, error) { + // Inserts are generated from a schema snapshot taken under table lock. + schema := tbl.schema.clone() + rowID := tbl.nextID + tbl.nextID++ + + var cols []column + for _, c := range schema.columns { + if c.generated != "" { + continue + } + if omitCols != nil { + if _, ok := omitCols[c.name]; ok { + continue + } + } + cols = append(cols, c) + } + if len(cols) == 0 { + return "", nil, nil + } + + colNames := make([]string, 0, len(cols)) + placeholders := make([]string, 0, len(cols)) + args := make([]any, 0, len(cols)) + for _, c := range cols { + colNames = append(colNames, c.name) + placeholders = append(placeholders, "?") + args = append(args, buildRandomValue(rng, tbl, c, rowID)) + } + + stmt := fmt.Sprintf("INSERT INTO %s (%s) VALUES (%s)", + tbl.fqName(), + backtickJoin(colNames), + strings.Join(placeholders, ","), + ) + return stmt, args, nil +} + +func buildUpdateLocked(rng *rand.Rand, tbl *table) (string, []any, error) { + schema := tbl.schema.clone() + + var candidates []column + for _, c := range schema.columns { + if c.generated != "" { + continue + } + if containsString(schema.primaryKey, c.name) { + continue + } + 
candidates = append(candidates, c) + } + + if len(schema.primaryKey) == 1 { + // Single-column PK: do targeted updates by PK to keep the operation deterministic. + pk := schema.primaryKey[0] + if tbl.nextID <= 1 { + return "", nil, nil + } + key := int64(rng.Intn(int(tbl.nextID-1)) + 1) + if len(candidates) == 0 { + // As a last resort, update the PK itself to still generate UPDATE traffic. + // Pick a new key different from the old one. + newKey := key + int64(rng.Intn(1024)+1) + stmt := fmt.Sprintf("UPDATE %s SET `%s`=? WHERE `%s`=?", + tbl.fqName(), + pk, + pk, + ) + return stmt, []any{newKey, key}, nil + } + col := candidates[rng.Intn(len(candidates))] + stmt := fmt.Sprintf("UPDATE %s SET `%s`=? WHERE `%s`=?", + tbl.fqName(), + col.name, + pk, + ) + args := []any{buildRandomValue(rng, tbl, col, key), key} + return stmt, args, nil + } + + // Keyless or composite PK tables: do a bounded update without relying on key materialization. + if len(candidates) == 0 { + for _, c := range schema.columns { + if c.generated != "" { + continue + } + candidates = append(candidates, c) + } + } + if len(candidates) == 0 { + return "", nil, nil + } + col := candidates[rng.Intn(len(candidates))] + rowID := tbl.nextID + if rowID <= 0 { + rowID = 1 + } + stmt := fmt.Sprintf("UPDATE %s SET `%s`=? LIMIT 1", + tbl.fqName(), + col.name, + ) + return stmt, []any{buildRandomValue(rng, tbl, col, rowID)}, nil +} + +func buildDeleteLocked(rng *rand.Rand, tbl *table) (string, []any, error) { + schema := tbl.schema.clone() + if len(schema.primaryKey) == 1 { + pk := schema.primaryKey[0] + if tbl.nextID <= 1 { + return "", nil, nil + } + key := int64(rng.Intn(int(tbl.nextID-1)) + 1) + stmt := fmt.Sprintf("DELETE FROM %s WHERE `%s`=?", tbl.fqName(), pk) + return stmt, []any{key}, nil + } + // Keyless or composite PK tables: do a bounded delete without requiring keys. 
+ return fmt.Sprintf("DELETE FROM %s LIMIT 1", tbl.fqName()), nil, nil +} + +func buildMotifUpdateAfterUnifiedLocked(rng *rand.Rand, tbl *table) (string, []any, error) { + // This is only valid after PK evolution: PRIMARY KEY (a, site_code). + schema := tbl.schema.clone() + if len(schema.primaryKey) != 2 { + return "", nil, nil + } + if tbl.nextID <= tbl.motifUnifiedStart { + return "", nil, nil + } + a := int64(rng.Intn(int(tbl.nextID-tbl.motifUnifiedStart)) + int(tbl.motifUnifiedStart)) + + stmt := fmt.Sprintf("UPDATE %s SET `b`=? WHERE `a`=? AND `site_code`=''", + tbl.fqName(), + ) + return stmt, []any{int32(rng.Intn(1_000_000)), a}, nil +} + +func buildRandomValue(rng *rand.Rand, tbl *table, c column, rowID int64) any { + // Keep payloads deterministic enough for triage, and keep SQL text ASCII-only by using placeholders. + switch strings.ToUpper(c.typ.base) { + case "BIGINT": + if c.name == "id" { + return rowID + } + return deterministicInt64(rowID) + case "INT": + if c.name == "a" && tbl.isMotif { + return int32(rowID) + } + return int32(rng.Intn(1_000_000)) + case "VARCHAR": + if c.name == "pad" { + return strings.Repeat("x", 256) + } + return randASCII(rng, min(32, max(8, c.typ.varcharN/2))) + case "DATETIME": + return deterministicTime(rowID) + case "DECIMAL": + return deterministicDecimal(rowID) + case "JSON": + return fmt.Sprintf("{\"id\":%d,\"tbl\":\"%s\"}", rowID, tbl.name) + case "VARBINARY": + return []byte(fmt.Sprintf("%064x", rowID)) + default: + return nil + } +} + +func randASCII(rng *rand.Rand, n int) string { + const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + b := make([]byte, n) + for i := range b { + b[i] = letters[rng.Intn(len(letters))] + } + return string(b) +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/tests/utils/random_ddl_test_runner/dml_test.go b/tests/utils/random_ddl_test_runner/dml_test.go 
new file mode 100644 index 0000000000..c448cf721b --- /dev/null +++ b/tests/utils/random_ddl_test_runner/dml_test.go @@ -0,0 +1,48 @@ +package main + +import ( + "math/rand" + "strings" + "testing" +) + +func TestRandASCII_IsASCIIAlphaNum(t *testing.T) { + rng := rand.New(rand.NewSource(1)) + s := randASCII(rng, 128) + for i := 0; i < len(s); i++ { + b := s[i] + isNum := b >= '0' && b <= '9' + isLower := b >= 'a' && b <= 'z' + isUpper := b >= 'A' && b <= 'Z' + if !(isNum || isLower || isUpper) { + t.Fatalf("unexpected byte %q in %q", b, s) + } + } +} + +func TestMotifInsert_OmitsSiteCode(t *testing.T) { + tbl := &table{ + db: "db1", + name: "t03", + isMotif: true, + schema: tableSchema{ + columns: []column{ + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "INT"}, nullable: false}, + {name: "site_code", typ: colType{base: "VARCHAR", varcharN: 64}, nullable: false, defaultSQL: "'100'"}, + }, + primaryKey: []string{"a", "site_code"}, + }, + exists: true, + nextID: 100, + frozen: map[int64]struct{}{}, + } + rng := rand.New(rand.NewSource(1)) + stmt, _, err := buildMotifDML(rng, tbl, 3) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if strings.Contains(stmt, "site_code") { + t.Fatalf("expected site_code to be omitted, stmt=%s", stmt) + } +} diff --git a/tests/utils/random_ddl_test_runner/extra_workers.go b/tests/utils/random_ddl_test_runner/extra_workers.go new file mode 100644 index 0000000000..d446713dfc --- /dev/null +++ b/tests/utils/random_ddl_test_runner/extra_workers.go @@ -0,0 +1,185 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "math/rand" + "strings" + "sync/atomic" + "time" +) + +func bigTxnWorker( + ctx context.Context, + db *sql.DB, + model *clusterModel, + seed int64, + cfg dmlConfig, + activeWorkers *int32, +) { + // bigTxnWorker periodically runs large insert transactions to stress: + // - large message paths for MQ sinks, + // - large commit and apply paths for MySQL sink. 
+ if !cfg.BigTxnEnabled || cfg.BigTxnInterval.Duration <= 0 { + return + } + rng := rand.New(rand.NewSource(seed)) + ticker := time.NewTicker(cfg.BigTxnInterval.Duration) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + // When the overall DML pool is heavily degraded, skip big transactions to help recovery. + if atomic.LoadInt32(activeWorkers) <= 4 { + continue + } + + if len(model.splitTables) == 0 { + continue + } + tbl := model.splitTables[rng.Intn(len(model.splitTables))] + rows := cfg.BigTxnRowsMin + if cfg.BigTxnRowsMax > cfg.BigTxnRowsMin { + rows = cfg.BigTxnRowsMin + rng.Intn(cfg.BigTxnRowsMax-cfg.BigTxnRowsMin+1) + } + _ = runBigInsertTxn(ctx, db, tbl, rows) + } +} + +func runBigInsertTxn(ctx context.Context, db *sql.DB, tbl *table, rows int) error { + // Build a single multi-row INSERT inside a transaction to create a "big txn" workload. + tbl.mu.Lock() + if !tbl.exists { + tbl.mu.Unlock() + return nil + } + schema := tbl.schema.clone() + startID := tbl.nextID + tbl.nextID += int64(rows) + tbl.mu.Unlock() + + var cols []column + for _, c := range schema.columns { + if c.generated != "" { + continue + } + cols = append(cols, c) + } + if len(cols) == 0 { + return nil + } + + colNames := make([]string, 0, len(cols)) + for _, c := range cols { + colNames = append(colNames, c.name) + } + + var valuesSQL strings.Builder + var args []any + for i := 0; i < rows; i++ { + if i > 0 { + valuesSQL.WriteString(",") + } + valuesSQL.WriteString("(") + for j := range cols { + if j > 0 { + valuesSQL.WriteString(",") + } + valuesSQL.WriteString("?") + } + valuesSQL.WriteString(")") + rowID := startID + int64(i) + for _, c := range cols { + args = append(args, buildRandomValue(rand.New(rand.NewSource(rowID)), tbl, c, rowID)) + } + } + + stmt := fmt.Sprintf("INSERT INTO %s (%s) VALUES %s", + tbl.fqName(), + backtickJoin(colNames), + valuesSQL.String(), + ) + + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return err + 
} + if _, err := tx.ExecContext(ctx, stmt, args...); err != nil { + _ = tx.Rollback() + return err + } + return tx.Commit() +} + +func conflictWriter( + ctx context.Context, + db *sql.DB, + model *clusterModel, + seed int64, + cfg dmlConfig, + counters *dmlCounters, +) { + // conflictWriter continuously upserts into a small key space to create write conflicts. + // This targets row-level contention and duplicate key paths. + if !cfg.KeyConflictEnabled || cfg.KeyConflictKeyspace <= 0 { + return + } + rng := rand.New(rand.NewSource(seed)) + targetTables := collectConflictTables(model) + if len(targetTables) == 0 { + return + } + + for { + select { + case <-ctx.Done(): + return + default: + } + + tbl := targetTables[rng.Intn(len(targetTables))] + tbl.mu.Lock() + exists := tbl.exists + tbl.mu.Unlock() + if !exists { + _ = sleepWithContext(ctx, 200*time.Millisecond) + continue + } + + key := rng.Intn(cfg.KeyConflictKeyspace) + 1 + stmt := fmt.Sprintf("INSERT INTO %s (`id`,`a`,`b`,`c`,`d`,`e`,`bin`) VALUES (?,?,?,?,?,?,?) "+ + "ON DUPLICATE KEY UPDATE `a`=VALUES(`a`),`b`=VALUES(`b`),`c`=VALUES(`c`)", + tbl.fqName(), + ) + args := []any{ + int64(key), + int32(rng.Intn(1_000_000)), + randASCII(rng, 16), + deterministicDecimal(int64(key)), + deterministicTime(int64(key)), + fmt.Sprintf("{\"k\":%d}", key), + []byte(fmt.Sprintf("%064x", key)), + } + _, err := db.ExecContext(ctx, stmt, args...) + counters.record(err) + _ = sleepWithContext(ctx, 20*time.Millisecond) + } +} + +func collectConflictTables(model *clusterModel) []*table { + // Pick a stable target table for conflict writes to keep the workload deterministic. + var out []*table + for _, t := range model.churnTables { + // Use a single, deterministic churn family (t10) which is guaranteed to have `id` PK in initial schema. 
+ if t.name == "t10" { + out = append(out, t) + } + } + return out +} diff --git a/tests/utils/random_ddl_test_runner/failover.go b/tests/utils/random_ddl_test_runner/failover.go new file mode 100644 index 0000000000..ddae881feb --- /dev/null +++ b/tests/utils/random_ddl_test_runner/failover.go @@ -0,0 +1,161 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + "net/http" + "os" + "os/exec" + "strconv" + "time" +) + +func (r *runner) failoverLoop(ctx context.Context, motifStep *int32, trace *ddlTrace) error { + _ = motifStep + _ = trace + + // failoverLoop randomly kills and restarts captures to simulate process-level failover. + // + // This is only meaningful when the integration harness has started multiple captures + // (and the addresses are provided via config.failover.capture_addrs). + // + // The restart uses the integration helper scripts (run_cdc_server/kill_cdc_pid) which + // are expected to be in PATH when run under tests/integration_tests/*. + minD := r.cfg.Failover.MinInterval.Duration + maxD := r.cfg.Failover.MaxInterval.Duration + if maxD < minD { + maxD, minD = minD, maxD + } + if minD <= 0 { + minD = 20 * time.Second + } + if maxD <= 0 { + maxD = 40 * time.Second + } + if len(r.cfg.Failover.CaptureAddrs) == 0 { + return nil + } + + rng := rand.New(rand.NewSource(r.cfg.Seed + 30_000)) + restartRound := 0 + + for { + sleep := minD + if maxD > minD { + sleep = minD + time.Duration(rng.Int63n(int64(maxD-minD))) + } + if err := sleepWithContext(ctx, sleep); err != nil { + return nil + } + + if rng.Float64() < r.cfg.Failover.GatedProbability { + // Optional gating: avoid failover when checkpoint is not advancing to reduce + // the chance of amplifying an existing stall. 
+ st1, err := r.getChangefeedStatus(ctx) + if err != nil { + return err + } + if err := sleepWithContext(ctx, 3*time.Second); err != nil { + return nil + } + st2, err := r.getChangefeedStatus(ctx) + if err != nil { + return err + } + if st1.Checkpoint != 0 && st1.Checkpoint == st2.Checkpoint { + r.logger.Printf("failover gated: checkpoint did not advance (%d)", st1.Checkpoint) + continue + } + } + + target := r.cfg.Failover.CaptureAddrs[rng.Intn(len(r.cfg.Failover.CaptureAddrs))] + pid, err := getCapturePID(ctx, target, r.cfg.CDC.User, r.cfg.CDC.Password) + if err != nil { + r.logger.Printf("failover: cannot get pid addr=%s err=%v", target, err) + continue + } + if pid == 0 { + r.logger.Printf("failover: empty pid addr=%s", target) + continue + } + + r.logger.Printf("failover: killing capture addr=%s pid=%d", target, pid) + if err := execCommand(ctx, "kill_cdc_pid", strconv.Itoa(pid)); err != nil { + r.logger.Printf("failover: kill failed pid=%d err=%v", pid, err) + continue + } + + restartRound++ + suffix := fmt.Sprintf("failover-%d", restartRound) + r.logger.Printf("failover: restarting capture addr=%s suffix=%s", target, suffix) + var lastErr error + for attempt := 0; attempt < 3; attempt++ { + lastErr = execCommand(ctx, "run_cdc_server", + "--workdir", r.cfg.Workdir, + "--binary", r.cfg.Failover.CdcBinary, + "--logsuffix", suffix, + "--addr", target, + ) + if lastErr == nil { + break + } + r.logger.Printf("failover: restart attempt=%d err=%v", attempt+1, lastErr) + if err := sleepWithContext(ctx, 3*time.Second); err != nil { + return nil + } + } + if lastErr != nil { + return lastErr + } + } +} + +func getCapturePID(ctx context.Context, addr, user, password string) (int, error) { + u := fmt.Sprintf("http://%s/status", addr) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return 0, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return 0, err + } + defer resp.Body.Close() + + if resp.StatusCode == 
http.StatusUnauthorized { + req2, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return 0, err + } + req2.SetBasicAuth(user, password) + resp2, err := http.DefaultClient.Do(req2) + if err != nil { + return 0, err + } + defer resp2.Body.Close() + resp = resp2 + } + + if resp.StatusCode != http.StatusOK { + return 0, fmt.Errorf("status http %d", resp.StatusCode) + } + + var raw map[string]any + if err := json.NewDecoder(resp.Body).Decode(&raw); err != nil { + return 0, err + } + p, ok := raw["pid"].(float64) + if !ok { + return 0, nil + } + return int(p), nil +} + +func execCommand(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} diff --git a/tests/utils/random_ddl_test_runner/health.go b/tests/utils/random_ddl_test_runner/health.go new file mode 100644 index 0000000000..12287982d5 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/health.go @@ -0,0 +1,88 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "time" +) + +type changefeedStatus struct { + State string + Checkpoint uint64 +} + +func (r *runner) getChangefeedStatus(ctx context.Context) (changefeedStatus, error) { + // Query TiCDC OpenAPI to obtain changefeed state and checkpoint tso. + // + // The integration tests use a fixed basic auth user/password. This runner keeps the + // request logic small and dependency-free to remain easy to vendor into test envs. 
+ if r.cfg.CDC.ChangefeedID == "" { + return changefeedStatus{}, fmt.Errorf("cdc.changefeed_id is required") + } + + u := url.URL{ + Scheme: "http", + Host: r.cfg.CDC.Addr, + Path: "/api/v2/changefeeds/" + url.PathEscape(r.cfg.CDC.ChangefeedID) + "/status", + } + q := u.Query() + q.Set("keyspace", r.cfg.CDC.Keyspace) + u.RawQuery = q.Encode() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) + if err != nil { + return changefeedStatus{}, err + } + req.SetBasicAuth(r.cfg.CDC.User, r.cfg.CDC.Password) + + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + if err != nil { + return changefeedStatus{}, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return changefeedStatus{}, fmt.Errorf("cdc status http %d: %s", resp.StatusCode, string(b)) + } + + var raw map[string]any + dec := json.NewDecoder(resp.Body) + if err := dec.Decode(&raw); err != nil { + return changefeedStatus{}, err + } + + state, _ := raw["state"].(string) + checkpoint := parseUint64(raw["checkpoint_tso"]) + if checkpoint == 0 { + checkpoint = parseUint64(raw["checkpoint_ts"]) + } + return changefeedStatus{ + State: state, + Checkpoint: checkpoint, + }, nil +} + +func parseUint64(v any) uint64 { + // TiCDC API fields may be encoded as number or string depending on endpoint/version. 
+ switch x := v.(type) { + case float64: + if x < 0 { + return 0 + } + return uint64(x) + case string: + n, err := strconv.ParseUint(x, 10, 64) + if err != nil { + return 0 + } + return n + default: + return 0 + } +} diff --git a/tests/utils/random_ddl_test_runner/logger.go b/tests/utils/random_ddl_test_runner/logger.go new file mode 100644 index 0000000000..00cdb6d0ab --- /dev/null +++ b/tests/utils/random_ddl_test_runner/logger.go @@ -0,0 +1,21 @@ +package main + +import ( + "io" + "log" + "os" + "path/filepath" +) + +func newRunnerLogger(workdir string) (*log.Logger, func(), error) { + if err := os.MkdirAll(workdir, 0o755); err != nil { + return nil, nil, err + } + f, err := os.OpenFile(filepath.Join(workdir, "runner.log"), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return nil, nil, err + } + w := io.MultiWriter(os.Stdout, f) + l := log.New(w, "", log.LstdFlags|log.Lmicroseconds|log.LUTC) + return l, func() { _ = f.Close() }, nil +} diff --git a/tests/utils/random_ddl_test_runner/logscan.go b/tests/utils/random_ddl_test_runner/logscan.go new file mode 100644 index 0000000000..205e305d6c --- /dev/null +++ b/tests/utils/random_ddl_test_runner/logscan.go @@ -0,0 +1,224 @@ +package main + +import ( + "bufio" + "bytes" + "fmt" + "io" + "log" + "os" + "path/filepath" + "sort" + "strings" +) + +func scanLogsForPatterns(workdir string, patterns []string, failOnMatch bool, logger *log.Logger) error { + // scanLogsForPatterns is a lightweight alternative to external tools (e.g., rg) so the + // runner can be used in minimal environments. + // + // Implementation notes: + // - Convert to lowercase on the fly and use bytes.Contains for substring search. + // - Use bufio.Reader.ReadLine to cap memory and handle very long lines by stitching + // a small suffix ("carry") across fragments. 
+ files, err := collectLogFiles(workdir) + if err != nil { + return err + } + if len(files) == 0 { + if logger != nil { + logger.Printf("log scan: no log files found under %s", workdir) + } + return nil + } + + lowerPatterns := make([]string, 0, len(patterns)) + for _, p := range patterns { + lowerPatterns = append(lowerPatterns, strings.ToLower(p)) + } + + type hit struct { + file string + line int + pat string + } + var hits []hit + + maxPatternLen := 0 + patternBytes := make([][]byte, 0, len(lowerPatterns)) + for _, p := range lowerPatterns { + b := []byte(p) + patternBytes = append(patternBytes, b) + if scanPatternMaxLen(p) > maxPatternLen { + maxPatternLen = scanPatternMaxLen(p) + } + } + + for _, path := range files { + f, err := os.Open(path) + if err != nil { + return err + } + lineNo := 0 + reader := bufio.NewReaderSize(f, 256*1024) + + carry := make([]byte, 0, maxPatternLen) + scratch := make([]byte, 0, 256*1024) + tmp := make([]byte, 0, 256*1024) + lineMatched := false + + for { + part, isPrefix, err := reader.ReadLine() + if err != nil { + if err == io.EOF { + break + } + _ = f.Close() + return err + } + + if !lineMatched { + tmp = append(tmp[:0], part...) + for i := range tmp { + c := tmp[i] + if c >= 'A' && c <= 'Z' { + tmp[i] = c + ('a' - 'A') + } + } + + scratch = append(scratch[:0], carry...) + scratch = append(scratch, tmp...) + + for i, p := range patternBytes { + if logLineMatchesPattern(scratch, p, lowerPatterns[i]) { + hits = append(hits, hit{file: filepath.Base(path), line: lineNo + 1, pat: lowerPatterns[i]}) + lineMatched = true + break + } + } + } + + if !isPrefix { + lineNo++ + carry = carry[:0] + lineMatched = false + continue + } + // Keep a small suffix from the previous fragment to detect patterns spanning boundaries. 
+ if len(scratch) == 0 { + carry = carry[:0] + continue + } + keep := maxPatternLen - 1 + if keep <= 0 { + carry = carry[:0] + continue + } + if keep > len(scratch) { + keep = len(scratch) + } + carry = append(carry[:0], scratch[len(scratch)-keep:]...) + } + + _ = f.Close() + } + + if len(hits) == 0 { + if logger != nil { + logger.Printf("log scan: no matches") + } + return nil + } + + sort.Slice(hits, func(i, j int) bool { + if hits[i].file != hits[j].file { + return hits[i].file < hits[j].file + } + return hits[i].line < hits[j].line + }) + + if logger != nil { + logger.Printf("log scan: found %d matches", len(hits)) + for i := 0; i < len(hits) && i < 20; i++ { + logger.Printf("log scan match: file=%s line=%d pattern=%q", hits[i].file, hits[i].line, hits[i].pat) + } + } + + if failOnMatch { + return fmt.Errorf("log scan found %d panic/fatal/race matches", len(hits)) + } + return nil +} + +var ( + logScanFatalLevel = []byte("[fatal]") + logScanFatalKV = []byte("level=fatal") + logScanFatalPrefix = []byte("fatal error:") + logScanPanicLevel = []byte("[panic]") + logScanPanicKV = []byte("level=panic") + logScanPanicPrefix = []byte("panic:") +) + +func scanPatternMaxLen(pattern string) int { + switch pattern { + case "fatal": + return maxInt(len(logScanFatalLevel), len(logScanFatalKV), len(logScanFatalPrefix)) + case "panic": + return maxInt(len(logScanPanicLevel), len(logScanPanicKV), len(logScanPanicPrefix)) + default: + return len(pattern) + } +} + +func logLineMatchesPattern(lowerLine, patternBytes []byte, pattern string) bool { + switch pattern { + case "fatal": + trimmed := bytes.TrimLeft(lowerLine, " \t") + return bytes.Contains(lowerLine, logScanFatalLevel) || + bytes.Contains(lowerLine, logScanFatalKV) || + bytes.HasPrefix(trimmed, logScanFatalPrefix) + case "panic": + trimmed := bytes.TrimLeft(lowerLine, " \t") + return bytes.Contains(lowerLine, logScanPanicLevel) || + bytes.Contains(lowerLine, logScanPanicKV) || + bytes.HasPrefix(trimmed, 
logScanPanicPrefix) + default: + return bytes.Contains(lowerLine, patternBytes) + } +} + +func maxInt(first int, rest ...int) int { + out := first + for _, v := range rest { + if v > out { + out = v + } + } + return out +} + +func collectLogFiles(workdir string) ([]string, error) { + globs := []string{ + filepath.Join(workdir, "runner.log"), + filepath.Join(workdir, "ddl_trace.log"), + filepath.Join(workdir, "stdout*.log"), + filepath.Join(workdir, "cdc*.log"), + filepath.Join(workdir, "cdc_*_consumer*.log"), + filepath.Join(workdir, "cdc_*_consumer_stdout*.log"), + } + seen := make(map[string]struct{}) + for _, g := range globs { + m, err := filepath.Glob(g) + if err != nil { + return nil, err + } + for _, p := range m { + seen[p] = struct{}{} + } + } + out := make([]string, 0, len(seen)) + for p := range seen { + out = append(out, p) + } + sort.Strings(out) + return out, nil +} diff --git a/tests/utils/random_ddl_test_runner/logscan_test.go b/tests/utils/random_ddl_test_runner/logscan_test.go new file mode 100644 index 0000000000..b440c95976 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/logscan_test.go @@ -0,0 +1,70 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestScanLogsForPatternsIgnoresPayloadSubstrings(t *testing.T) { + dir := t.TempDir() + content := []byte(` +[2026/06/16 13:31:34.281 +08:00] [DEBUG] [basic_dispatcher.go:600] ["dispatcher receive all event"] [event="Rows: Insert: Row: 1, Bb8bdTFTEIN9i3spwifGjZj3AmFAtalR;"] +[2026/06/16 13:36:34.814 +08:00] [DEBUG] [basic_dispatcher.go:600] ["dispatcher receive all event"] [event="Rows: Insert: Row: 2, 1YCs3x0WFrKYaheC3jpXpAnicxBqG3pe;"] +`) + if err := os.WriteFile(filepath.Join(dir, "cdc.log"), content, 0o644); err != nil { + t.Fatal(err) + } + + if err := scanLogsForPatterns(dir, []string{"panic", "fatal", "DATA RACE"}, true, nil); err != nil { + t.Fatalf("scanLogsForPatterns() unexpected error = %v", err) + } +} + +func 
TestScanLogsForPatternsDetectsFatalLogLevel(t *testing.T) { + dir := t.TempDir() + content := []byte(`[2026/06/16 13:31:34.281 +08:00] [FATAL] [server.go:1] ["failed"]`) + if err := os.WriteFile(filepath.Join(dir, "cdc.log"), content, 0o644); err != nil { + t.Fatal(err) + } + + if err := scanLogsForPatterns(dir, []string{"fatal"}, true, nil); err == nil { + t.Fatalf("scanLogsForPatterns() expected fatal log level match") + } +} + +func TestScanLogsForPatternsDetectsPanicPrefix(t *testing.T) { + dir := t.TempDir() + content := []byte(`panic: runtime error: invalid memory address`) + if err := os.WriteFile(filepath.Join(dir, "stdout.log"), content, 0o644); err != nil { + t.Fatal(err) + } + + if err := scanLogsForPatterns(dir, []string{"panic"}, true, nil); err == nil { + t.Fatalf("scanLogsForPatterns() expected panic prefix match") + } +} + +func TestScanLogsForPatternsDetectsFatalErrorPrefix(t *testing.T) { + dir := t.TempDir() + content := []byte(`fatal error: concurrent map writes`) + if err := os.WriteFile(filepath.Join(dir, "stdout.log"), content, 0o644); err != nil { + t.Fatal(err) + } + + if err := scanLogsForPatterns(dir, []string{"fatal"}, true, nil); err == nil { + t.Fatalf("scanLogsForPatterns() expected fatal error prefix match") + } +} + +func TestScanLogsForPatternsKeepsCustomSubstringMatch(t *testing.T) { + dir := t.TempDir() + content := []byte(`[INFO] custom marker appeared`) + if err := os.WriteFile(filepath.Join(dir, "cdc.log"), content, 0o644); err != nil { + t.Fatal(err) + } + + if err := scanLogsForPatterns(dir, []string{"custom marker"}, true, nil); err == nil { + t.Fatalf("scanLogsForPatterns() expected custom substring match") + } +} diff --git a/tests/utils/random_ddl_test_runner/main.go b/tests/utils/random_ddl_test_runner/main.go new file mode 100644 index 0000000000..ec05d3da08 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/main.go @@ -0,0 +1,91 @@ +package main + +import ( + "flag" + "fmt" + "os" +) + +// Command 
random_ddl_test_runner is a deterministic workload generator and verifier +// for TiCDC integration tests. +// +// It has two phases: +// - bootstrap: initialize identical schemas and deterministic seed data on both upstream and downstream. +// - workload: run concurrent random DML/DDL against upstream, while monitoring replication health. +// +// Typical end-to-end flow (driven by tests/integration_tests/*/run.sh): +// +// Components: +// +// H: integration run.sh (harness) +// R: random_ddl_test_runner (this binary) +// U: upstream TiDB +// C: TiCDC capture(s) +// S: sink consumer for MQ/file sinks (optional) +// D: downstream TiDB +// DF: sync_diff_inspector +// +// Sequence (simplified): +// +// H -> U,D: start_tidb_cluster +// H -> R: bootstrap +// R -> U,D: create db/table + insert deterministic rows +// H -> C: start capture(s) +// H -> C: create changefeed (sink uri) +// H -> S: start consumer (optional) +// H -> R: workload +// R -> U: random DML/DDL + motif DDL sequence +// R -> C: poll changefeed status (checkpoint/state) and auto-tune workload +// R -> U: insert finish_mark (replication barrier) +// R -> D: wait finish_mark visible (catch up) +// R -> H: write diff_config.toml +// H -> DF: final upstream vs downstream diff +// +// The runner writes artifacts into the configured workdir, including: +// - runner.log / ddl_trace.log: runner-side logs and executed DDL traces. +// - runner_config.snapshot.json: the effective config used for this run. +// - diff_config.toml: final sync_diff_inspector config for the end-to-end diff. +// - diff_config_syncpoint_*.toml: optional syncpoint snapshot diff configs (MySQL sink). 
+func main() { + var configPath string + var phase string + + flag.StringVar(&configPath, "config", "", "path to runner config json") + flag.StringVar(&phase, "phase", "", "bootstrap|workload") + flag.Parse() + + if configPath == "" || phase == "" { + _, _ = fmt.Fprintln(os.Stderr, "usage: random_ddl_test_runner --config --phase ") + os.Exit(2) + } + + cfg, err := loadConfig(configPath) + if err != nil { + _, _ = fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + + logger, closeLogger, err := newRunnerLogger(cfg.Workdir) + if err != nil { + _, _ = fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + defer closeLogger() + + runner := newRunner(cfg, logger) + + switch phase { + case "bootstrap": + err = runner.bootstrap() + case "workload": + err = runner.workload() + default: + err = fmt.Errorf("unknown phase: %s", phase) + } + if err != nil { + logger.Printf("runner failed: %v", err) + _, _ = fmt.Fprintln(os.Stderr, err.Error()) + os.Exit(1) + } + logger.Printf("runner finished successfully") +} diff --git a/tests/utils/random_ddl_test_runner/model.go b/tests/utils/random_ddl_test_runner/model.go new file mode 100644 index 0000000000..cb48349063 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/model.go @@ -0,0 +1,469 @@ +package main + +import ( + "fmt" + "math" + "math/rand" + "strings" + "sync" + "time" +) + +type domain string + +const ( + domainStable domain = "stable" + domainChurn domain = "churn" + domainSplit domain = "split_candidate" +) + +type colType struct { + base string + varcharN int + decimalP int + decimalS int + varbinaryN int +} + +func (t colType) sql() string { + switch strings.ToUpper(t.base) { + case "VARCHAR": + return fmt.Sprintf("VARCHAR(%d)", t.varcharN) + case "VARBINARY": + return fmt.Sprintf("VARBINARY(%d)", t.varbinaryN) + case "DECIMAL": + return fmt.Sprintf("DECIMAL(%d,%d)", t.decimalP, t.decimalS) + default: + return strings.ToUpper(t.base) + } +} + +type column struct { + name string + typ colType + nullable bool + 
defaultSQL string + generated string + stored bool +} + +func (c column) sql() string { + var b strings.Builder + b.WriteString("`") + b.WriteString(c.name) + b.WriteString("` ") + b.WriteString(c.typ.sql()) + if c.generated != "" { + kind := "VIRTUAL" + if c.stored { + kind = "STORED" + } + b.WriteString(" GENERATED ALWAYS AS (") + b.WriteString(c.generated) + b.WriteString(") ") + b.WriteString(kind) + } + if !c.nullable { + b.WriteString(" NOT NULL") + } + if c.defaultSQL != "" && c.generated == "" { + b.WriteString(" DEFAULT ") + b.WriteString(c.defaultSQL) + } + return b.String() +} + +type index struct { + name string + columns []string + unique bool +} + +func (idx index) sql() string { + var b strings.Builder + if idx.unique { + b.WriteString("UNIQUE KEY ") + } else { + b.WriteString("KEY ") + } + b.WriteString("`") + b.WriteString(idx.name) + b.WriteString("` (") + for i, c := range idx.columns { + if i > 0 { + b.WriteString(",") + } + b.WriteString("`") + b.WriteString(c) + b.WriteString("`") + } + b.WriteString(")") + return b.String() +} + +type tableSchema struct { + columns []column + primaryKey []string + indexes []index + charset string + collation string + partitionSQL string +} + +func (s tableSchema) clone() tableSchema { + cp := s + cp.columns = append([]column(nil), s.columns...) + cp.primaryKey = append([]string(nil), s.primaryKey...) + cp.indexes = append([]index(nil), s.indexes...) 
+ return cp +} + +func (s tableSchema) createTableSQL(dbName, tableName string) string { + var b strings.Builder + b.WriteString("CREATE TABLE IF NOT EXISTS `") + b.WriteString(dbName) + b.WriteString("`.`") + b.WriteString(tableName) + b.WriteString("` (") + for i, c := range s.columns { + if i > 0 { + b.WriteString(",") + } + b.WriteString("\n ") + b.WriteString(c.sql()) + } + if len(s.primaryKey) > 0 { + b.WriteString(",\n PRIMARY KEY (") + for i, c := range s.primaryKey { + if i > 0 { + b.WriteString(",") + } + b.WriteString("`") + b.WriteString(c) + b.WriteString("`") + } + b.WriteString(")") + } + for _, idx := range s.indexes { + b.WriteString(",\n ") + b.WriteString(idx.sql()) + } + b.WriteString("\n) ") + if s.charset != "" { + b.WriteString("DEFAULT CHARSET=") + b.WriteString(s.charset) + b.WriteString(" ") + } + if s.collation != "" { + b.WriteString("COLLATE=") + b.WriteString(s.collation) + b.WriteString(" ") + } + if s.partitionSQL != "" { + b.WriteString(s.partitionSQL) + b.WriteString(" ") + } + return b.String() +} + +type tableFamily struct { + id int + name string + domain domain + schema tableSchema + isMotif bool +} + +func defaultDatabaseNames() []string { + return []string{"db1", "db2", "db3", "db4", "db5"} +} + +func familyName(i int) string { + return fmt.Sprintf("t%02d", i) +} + +func defaultTableFamilies() []tableFamily { + base := func() tableSchema { + return tableSchema{ + columns: []column{ + {name: "id", typ: colType{base: "BIGINT"}, nullable: false}, + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "VARCHAR", varcharN: 64}, nullable: false}, + {name: "c", typ: colType{base: "DECIMAL", decimalP: 10, decimalS: 2}, nullable: false, defaultSQL: "0"}, + {name: "d", typ: colType{base: "DATETIME"}, nullable: false}, + {name: "e", typ: colType{base: "JSON"}, nullable: true}, + {name: "bin", typ: colType{base: "VARBINARY", varbinaryN: 64}, nullable: true}, + }, + primaryKey: []string{"id"}, + } + } 
+ + // Motif table family starts with a not-null unique key and will evolve schema during workload. + motif := tableSchema{ + columns: []column{ + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "INT"}, nullable: false}, + }, + indexes: []index{{name: "uk_a", columns: []string{"a"}, unique: true}}, + } + + gbk := base() + gbk.charset = "gbk" + gbk.collation = "gbk_bin" + + // Avoid generated columns in baseline schemas because the current storage sink CSV pipeline + // (cloud storage sink + storage consumer) does not fully support generated columns. + gen := tableSchema{ + columns: []column{ + {name: "id", typ: colType{base: "BIGINT"}, nullable: false}, + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "VARBINARY", varbinaryN: 64}, nullable: false}, + {name: "c", typ: colType{base: "VARCHAR", varcharN: 64}, nullable: false}, + }, + primaryKey: []string{"id"}, + } + + keyless := tableSchema{ + columns: []column{ + {name: "id", typ: colType{base: "BIGINT"}, nullable: false}, + {name: "a", typ: colType{base: "INT"}, nullable: false}, + {name: "b", typ: colType{base: "VARCHAR", varcharN: 64}, nullable: false}, + {name: "c", typ: colType{base: "VARCHAR", varcharN: 128}, nullable: false}, + }, + primaryKey: []string{"id"}, + } + + rangePart := base() + rangePart.partitionSQL = "PARTITION BY RANGE (`id`) (PARTITION p0 VALUES LESS THAN (1000000000000), PARTITION p1 VALUES LESS THAN (2000000000000), PARTITION p2 VALUES LESS THAN (3000000000000))" + + hashPart := base() + hashPart.partitionSQL = "PARTITION BY HASH (`id`) PARTITIONS 4" + + split := tableSchema{ + columns: []column{ + {name: "id", typ: colType{base: "BIGINT"}, nullable: false}, + {name: "v", typ: colType{base: "INT"}, nullable: false}, + {name: "pad", typ: colType{base: "VARCHAR", varcharN: 1024}, nullable: false}, + {name: "ts", typ: colType{base: "DATETIME"}, nullable: false}, + }, + primaryKey: []string{"id"}, + 
partitionSQL: "PARTITION BY RANGE (`id`) (PARTITION p0 VALUES LESS THAN (1000000000000), PARTITION p1 VALUES LESS THAN (2000000000000), PARTITION p2 VALUES LESS THAN (3000000000000))", + } + + families := make([]tableFamily, 0, 20) + for i := 0; i < 20; i++ { + f := tableFamily{ + id: i, + name: familyName(i), + domain: domainStable, + schema: base(), + } + switch { + case i <= 9: + f.domain = domainStable + case i <= 15: + f.domain = domainChurn + default: + f.domain = domainSplit + } + + switch i { + case 0: + f.schema = base() + case 1: + s := base() + s.indexes = append(s.indexes, index{name: "uk_b", columns: []string{"b"}, unique: true}) + f.schema = s + case 2: + s := base() + s.indexes = append(s.indexes, index{name: "idx_a", columns: []string{"a"}, unique: false}) + f.schema = s + case 3: + f.schema = motif + f.isMotif = true + case 4: + s := base() + s.indexes = append(s.indexes, index{name: "uk_a_b", columns: []string{"a", "b"}, unique: true}) + f.schema = s + case 5: + f.schema = keyless + case 6: + f.schema = gen + case 7: + f.schema = rangePart + case 8: + f.schema = hashPart + case 9: + f.schema = gbk + case 16, 17, 18, 19: + f.schema = split + default: + f.schema = base() + } + families = append(families, f) + } + return families +} + +type table struct { + // mu guards mutable state (schema, name, exists, nextID, and motif markers). + mu sync.Mutex + db string + name string + domain domain + family int + isMotif bool + initialSchema tableSchema + schema tableSchema + exists bool + + nextID int64 + frozen map[int64]struct{} + + hot bool + + rangePartitionNextID int + rangePartitionNextBound int64 + + motifUnifiedStart int64 +} + +func (t *table) fqName() string { + return fmt.Sprintf("`%s`.`%s`", t.db, t.name) +} + +type clusterModel struct { + // clusterModel is an in-memory approximation of the workload surface used for + // generating DML and DDL. It is updated only when a DDL succeeds on upstream. 
+ dbs []string + tables []*table + hotTables []*table + coldTables []*table + stableTables []*table + churnTables []*table + splitTables []*table +} + +func buildInitialModel(cfg *config) *clusterModel { + // buildInitialModel constructs the initial schema model used by both bootstrap and workload. + // It must remain deterministic for a given config so tests can be reproduced by seed. + dbs := defaultDatabaseNames() + families := defaultTableFamilies() + + var tables []*table + for _, dbName := range dbs { + for _, fam := range families { + schema := fam.schema.clone() + tbl := &table{ + db: dbName, + name: fam.name, + domain: fam.domain, + family: fam.id, + isMotif: fam.isMotif, + initialSchema: schema.clone(), + schema: schema, + exists: true, + frozen: make(map[int64]struct{}), + } + if strings.Contains(strings.ToUpper(schema.partitionSQL), "RANGE") { + // The initial schemas using RANGE partitioning in this runner define p0/p1/p2. + tbl.rangePartitionNextID = 3 + tbl.rangePartitionNextBound = 3_000_000_000_000 + } + // Use deterministic initial row ranges. + baseRows := int64(cfg.Bootstrap.BaseRowsPerTable) + splitRows := int64(cfg.Bootstrap.SplitRowsPerTable) + switch fam.domain { + case domainSplit: + tbl.nextID = baseRows + splitRows + 1 + default: + tbl.nextID = baseRows + 1 + } + + // Mark per-db hot tables: one stable + one split candidate. 
+ if fam.name == "t00" || fam.name == "t16" { + tbl.hot = true + } + tables = append(tables, tbl) + } + } + + m := &clusterModel{dbs: dbs, tables: tables} + for _, t := range tables { + if t.hot { + m.hotTables = append(m.hotTables, t) + } else { + m.coldTables = append(m.coldTables, t) + } + switch t.domain { + case domainStable: + m.stableTables = append(m.stableTables, t) + case domainChurn: + m.churnTables = append(m.churnTables, t) + case domainSplit: + m.splitTables = append(m.splitTables, t) + } + } + return m +} + +func (m *clusterModel) pickTableForDML(rng *rand.Rand, hotspotRatio float64) *table { + // Prefer "hot" tables with a configurable probability to create hotspot pressure. + if len(m.hotTables) == 0 || len(m.coldTables) == 0 { + return m.tables[rng.Intn(len(m.tables))] + } + if rng.Float64() < hotspotRatio { + return m.hotTables[rng.Intn(len(m.hotTables))] + } + return m.coldTables[rng.Intn(len(m.coldTables))] +} + +func (m *clusterModel) pickTableForDomain(rng *rand.Rand, d domain) *table { + var candidates []*table + switch d { + case domainStable: + candidates = m.stableTables + case domainChurn: + candidates = m.churnTables + case domainSplit: + candidates = m.splitTables + default: + candidates = m.tables + } + if len(candidates) == 0 { + return nil + } + // Retry a few times to avoid frequently selecting dropped churn tables. + for i := 0; i < 10; i++ { + t := candidates[rng.Intn(len(candidates))] + t.mu.Lock() + exists := t.exists + t.mu.Unlock() + if exists { + return t + } + } + return candidates[rng.Intn(len(candidates))] +} + +func deterministicInt64(x int64) int64 { + // A cheap LCG step to mix inputs deterministically. + const a = 6364136223846793005 + const c = 1442695040888963407 + return a*x + c +} + +func asciiStringFromID(prefix string, id int64) string { + // Stable ASCII-only payload. 
+ return fmt.Sprintf("%s_%d", prefix, id) +} + +func deterministicDecimal(id int64) float64 { + return math.Mod(float64(deterministicInt64(id)%100000), 10000) / 100.0 +} + +func deterministicTime(id int64) time.Time { + // Keep within a reasonable range for MySQL/TiDB. + base := int64(1700000000) + return time.Unix(base+(id%86400), 0).UTC() +} diff --git a/tests/utils/random_ddl_test_runner/motif.go b/tests/utils/random_ddl_test_runner/motif.go new file mode 100644 index 0000000000..32163014fc --- /dev/null +++ b/tests/utils/random_ddl_test_runner/motif.go @@ -0,0 +1,166 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "log" + "sync/atomic" + "time" +) + +// Motif steps: +// 0: initial +// 1: site_code column added with per-db defaults +// 2: site_code default unified to empty string +// 3: primary key evolved to (a, site_code) +func runPrimaryMotif( + ctx context.Context, + db *sql.DB, + model *clusterModel, + motifStep *int32, + trace *ddlTrace, + logger *log.Logger, + profile string, +) { + // The motif is a deterministic DDL sequence on a single table family (t03) intended to + // cover tricky replication cases: + // - adding a new NOT NULL column with different defaults per database + // - unifying defaults over time + // - evolving primary keys after data already exists + // + // DML workers consult motifStep to adjust their write patterns accordingly. 
+ step1At, step2At, step3At := motifSchedule(profile) + + if err := sleepWithContext(ctx, step1At); err != nil { + return + } + if err := motifAddSiteCode(ctx, db, model, trace, logger); err == nil { + atomic.StoreInt32(motifStep, 1) + } + + if err := sleepWithContext(ctx, step2At-step1At); err != nil { + return + } + if err := motifUnifySiteCodeDefault(ctx, db, model, trace, logger); err == nil { + atomic.StoreInt32(motifStep, 2) + } + + if err := sleepWithContext(ctx, step3At-step2At); err != nil { + return + } + if err := motifAddCompositePK(ctx, db, model, trace, logger); err == nil { + atomic.StoreInt32(motifStep, 3) + } +} + +func motifSchedule(profile string) (time.Duration, time.Duration, time.Duration) { + // Use a profile-based schedule so that smoke runs complete all steps quickly, + // while weekly runs keep more steady-state time between transitions. + if profile == "weekly" { + return 2 * time.Minute, 10 * time.Minute, 20 * time.Minute + } + // Smoke mode: ensure all steps execute within a short run. 
+ return 10 * time.Second, 40 * time.Second, 70 * time.Second +} + +func motifAddSiteCode(ctx context.Context, db *sql.DB, model *clusterModel, trace *ddlTrace, logger *log.Logger) error { + for i, dbName := range model.dbs { + defaultVal := fmt.Sprintf("%d", (i+1)*100) + sqlText := fmt.Sprintf("ALTER TABLE `%s`.`t03` ADD COLUMN `site_code` VARCHAR(64) NOT NULL DEFAULT '%s'", + dbName, defaultVal) + _, err := db.ExecContext(ctx, sqlText) + if trace != nil { + trace.record("motif_add_site_code", fmt.Sprintf("`%s`.`t03`", dbName), sqlText, err) + } + if err != nil { + if logger != nil { + logger.Printf("motif step1 failed: db=%s err=%v", dbName, err) + } + return err + } + updateMotifSchemaAfterAdd(dbName, model, defaultVal) + } + return nil +} + +func motifUnifySiteCodeDefault(ctx context.Context, db *sql.DB, model *clusterModel, trace *ddlTrace, logger *log.Logger) error { + for _, dbName := range model.dbs { + sqlText := fmt.Sprintf("ALTER TABLE `%s`.`t03` MODIFY COLUMN `site_code` VARCHAR(64) NOT NULL DEFAULT ''", dbName) + _, err := db.ExecContext(ctx, sqlText) + if trace != nil { + trace.record("motif_unify_site_code_default", fmt.Sprintf("`%s`.`t03`", dbName), sqlText, err) + } + if err != nil { + if logger != nil { + logger.Printf("motif step2 failed: db=%s err=%v", dbName, err) + } + return err + } + updateMotifSchemaAfterUnify(dbName, model) + } + return nil +} + +func motifAddCompositePK(ctx context.Context, db *sql.DB, model *clusterModel, trace *ddlTrace, logger *log.Logger) error { + for _, dbName := range model.dbs { + sqlText := fmt.Sprintf("ALTER TABLE `%s`.`t03` ADD PRIMARY KEY (`a`,`site_code`)", dbName) + _, err := db.ExecContext(ctx, sqlText) + if trace != nil { + trace.record("motif_add_pk", fmt.Sprintf("`%s`.`t03`", dbName), sqlText, err) + } + if err != nil { + if logger != nil { + logger.Printf("motif step3 failed: db=%s err=%v", dbName, err) + } + return err + } + updateMotifSchemaAfterAddPK(dbName, model) + } + return nil +} + +func 
updateMotifSchemaAfterAdd(dbName string, model *clusterModel, defaultVal string) { + for _, t := range model.tables { + if t.db != dbName || !t.isMotif || t.name != "t03" { + continue + } + t.mu.Lock() + t.schema.columns = append(t.schema.columns, column{ + name: "site_code", + typ: colType{base: "VARCHAR", varcharN: 64}, + nullable: false, + defaultSQL: fmt.Sprintf("'%s'", defaultVal), + }) + t.mu.Unlock() + } +} + +func updateMotifSchemaAfterUnify(dbName string, model *clusterModel) { + for _, t := range model.tables { + if t.db != dbName || !t.isMotif || t.name != "t03" { + continue + } + t.mu.Lock() + for i := range t.schema.columns { + if t.schema.columns[i].name == "site_code" { + t.schema.columns[i].defaultSQL = "''" + break + } + } + // Record the boundary so later updates can avoid touching frozen rows (which have non-empty site_code). + t.motifUnifiedStart = t.nextID + t.mu.Unlock() + } +} + +func updateMotifSchemaAfterAddPK(dbName string, model *clusterModel) { + for _, t := range model.tables { + if t.db != dbName || !t.isMotif || t.name != "t03" { + continue + } + t.mu.Lock() + t.schema.primaryKey = []string{"a", "site_code"} + t.mu.Unlock() + } +} diff --git a/tests/utils/random_ddl_test_runner/runner.go b/tests/utils/random_ddl_test_runner/runner.go new file mode 100644 index 0000000000..75365c6d87 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/runner.go @@ -0,0 +1,18 @@ +package main + +import ( + "log" +) + +type runner struct { + cfg *config + logger *log.Logger +} + +func newRunner(cfg *config, logger *log.Logger) *runner { + // runner is a thin orchestrator. Heavy logic lives in bootstrap.go/workload.go. 
+ return &runner{ + cfg: cfg, + logger: logger, + } +} diff --git a/tests/utils/random_ddl_test_runner/selector.go b/tests/utils/random_ddl_test_runner/selector.go new file mode 100644 index 0000000000..ba4759b2d3 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/selector.go @@ -0,0 +1,71 @@ +package main + +import ( + "math/rand" + "sync" +) + +type ddlSelector struct { + mu sync.Mutex + windowSize int + window []string + counts map[string]int + kinds []ddlKind +} + +func newDDLSelector(kinds []ddlKind, windowSize int) *ddlSelector { + // The selector is stateful: it reduces weight for DDL kinds that were recently successful, + // which helps spread coverage and avoids repeatedly hammering the same schema change. + return &ddlSelector{ + windowSize: windowSize, + counts: make(map[string]int), + kinds: kinds, + } +} + +func (s *ddlSelector) pick(rng *rand.Rand) ddlKind { + // Pick with dynamic weights: + // weight(kind) = baseWeight / (1 + recentSuccessCount(kind)) + // where recentSuccessCount is tracked in a sliding window of the last N successes. + s.mu.Lock() + defer s.mu.Unlock() + + weights := make([]float64, 0, len(s.kinds)) + var sum float64 + for _, k := range s.kinds { + count := s.counts[k.name] + w := k.baseWeight / float64(1+count) + if w < 0.001 { + w = 0.001 + } + weights = append(weights, w) + sum += w + } + x := rng.Float64() * sum + var acc float64 + for i, w := range weights { + acc += w + if x <= acc { + return s.kinds[i] + } + } + return s.kinds[len(s.kinds)-1] +} + +func (s *ddlSelector) record(kindName string) { + // Record a successful DDL kind for weight dampening in a bounded window. 
+ s.mu.Lock() + defer s.mu.Unlock() + + s.window = append(s.window, kindName) + s.counts[kindName]++ + if len(s.window) <= s.windowSize { + return + } + evicted := s.window[0] + s.window = s.window[1:] + s.counts[evicted]-- + if s.counts[evicted] <= 0 { + delete(s.counts, evicted) + } +} diff --git a/tests/utils/random_ddl_test_runner/selector_test.go b/tests/utils/random_ddl_test_runner/selector_test.go new file mode 100644 index 0000000000..42404d874d --- /dev/null +++ b/tests/utils/random_ddl_test_runner/selector_test.go @@ -0,0 +1,30 @@ +package main + +import ( + "math/rand" + "testing" +) + +func TestDDLSelector_CoverageDebtLikeBehavior(t *testing.T) { + kinds := []ddlKind{ + {name: "a", baseWeight: 1}, + {name: "b", baseWeight: 1}, + } + s := newDDLSelector(kinds, 100) + for i := 0; i < 50; i++ { + s.record("a") + } + rng := rand.New(rand.NewSource(1)) + var ca, cb int + for i := 0; i < 1000; i++ { + k := s.pick(rng) + if k.name == "a" { + ca++ + } else if k.name == "b" { + cb++ + } + } + if cb <= ca { + t.Fatalf("expected b to be selected more often than a, got a=%d b=%d", ca, cb) + } +} diff --git a/tests/utils/random_ddl_test_runner/syncpoint_diff.go b/tests/utils/random_ddl_test_runner/syncpoint_diff.go new file mode 100644 index 0000000000..a460b7e8c3 --- /dev/null +++ b/tests/utils/random_ddl_test_runner/syncpoint_diff.go @@ -0,0 +1,471 @@ +package main + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "sort" + "strings" + "sync/atomic" + "time" + + "github.com/go-sql-driver/mysql" +) + +type ddlWindow struct { + start uint64 + end uint64 +} + +const mysqlErrNoSuchTable uint16 = 1146 + +func (r *runner) syncpointDiffLoop( + ctx context.Context, + up *sql.DB, + down *sql.DB, + model *clusterModel, + trace *ddlTrace, + successCounter *int32, +) error { + _ = up + _ = trace + + // syncpointDiffLoop periodically runs snapshot diffs based on TiCDC syncpoints. 
+ // + // Motivation: + // - The final diff runs at the end of the test and may not pinpoint when divergence happened. + // - Syncpoints provide pairs of (primary_ts on upstream, secondary_ts on downstream) that + // can be used for snapshot reads. Running diffs at several syncpoints helps localize issues. + // + // Practicality: + // - Snapshot diffing is fragile near DDL windows. We conservatively skip candidates that fall + // into TiDB DDL windows obtained from upstream /ddl/history. + if r.cfg.MySQL.DiffInterval.Duration <= 0 { + return nil + } + ticker := time.NewTicker(r.cfg.MySQL.DiffInterval.Duration) + defer ticker.Stop() + + var ( + lastPrimary uint64 + checked int + ) + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + } + if checked >= r.cfg.MySQL.MaxDiffChecks { + return nil + } + + n, err := r.runSyncpointDiffChecks(ctx, down, model, 1, &lastPrimary, false) + if err != nil { + return err + } + if n > 0 { + checked += n + if successCounter != nil { + atomic.StoreInt32(successCounter, int32(checked)) + } + } + } +} + +func (r *runner) runSyncpointDiffChecks( + ctx context.Context, + down *sql.DB, + model *clusterModel, + required int, + lastPrimary *uint64, + allowInDDLWindow bool, +) (int, error) { + // Run up to "required" syncpoint diffs and update lastPrimary to advance the cursor. 
+ if required <= 0 { + return 0, nil + } + if lastPrimary == nil { + return 0, fmt.Errorf("lastPrimary must not be nil") + } + + windows, err := fetchDDLWindows(ctx, r.cfg.MySQL.UpstreamStatusHost, r.cfg.MySQL.UpstreamStatusPort) + if err != nil { + return 0, err + } + + checked := 0 + for tries := 0; tries < 50 && checked < required; tries++ { + p, s, got, err := pickNextSyncpointCandidate(ctx, down, *lastPrimary) + if err != nil { + return checked, err + } + if !got { + return checked, nil + } + inWindow := inDDLWindow(p, windows) + if inWindow && !allowInDDLWindow { + r.logger.Printf("syncpoint diff: skip primary_ts=%d (in DDL window)", p) + *lastPrimary = p + continue + } + + confPath := filepath.Join(r.cfg.Workdir, fmt.Sprintf("diff_config_syncpoint_%d.toml", p)) + if err := r.writeSyncpointDiffConfig(confPath, model, p, s); err != nil { + return checked, err + } + + logPath := filepath.Join(r.cfg.Workdir, fmt.Sprintf("sync_diff_inspector_syncpoint_%d.log", p)) + diag, err := r.runSyncDiffInspectorWithSnapshotGuard(ctx, confPath, logPath, 3) + if err != nil { + if ctx.Err() != nil { + return checked, nil + } + if isSkippableSyncDiffFailure(diag) { + r.logger.Printf("syncpoint diff: skip primary_ts=%d (sync diff not applicable, see %s)", p, logPath) + *lastPrimary = p + continue + } + if inWindow { + r.logger.Printf("syncpoint diff: skip primary_ts=%d (diff failed in DDL window, see %s)", p, logPath) + *lastPrimary = p + continue + } + return checked, err + } + + checked++ + *lastPrimary = p + r.logger.Printf("syncpoint diff: success primary_ts=%d secondary_ts=%d", p, s) + } + return checked, nil +} + +func (r *runner) ensureSyncpointDiffAfterWorkload( + ctx context.Context, + down *sql.DB, + model *clusterModel, + required int, +) error { + if required <= 0 { + return nil + } + var lastPrimary uint64 + checked := 0 + for checked < required { + n, err := r.runSyncpointDiffChecks(ctx, down, model, required-checked, &lastPrimary, true) + if err != nil { + return 
err + } + checked += n + if checked >= required { + return nil + } + select { + case <-ctx.Done(): + return fmt.Errorf("syncpoint diff did not complete: required=%d checked=%d: %w", required, checked, ctx.Err()) + case <-time.After(5 * time.Second): + } + } + return nil +} + +func pickNextSyncpointCandidate(ctx context.Context, down *sql.DB, after uint64) (primary uint64, secondary uint64, ok bool, err error) { + queryCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + rows, err := down.QueryContext(queryCtx, + "SELECT primary_ts, secondary_ts FROM tidb_cdc.syncpoint_v1 WHERE primary_ts > ? ORDER BY primary_ts ASC LIMIT 200", + after, + ) + if err != nil { + if isNoSuchTableError(err) { + // TiCDC creates tidb_cdc.syncpoint_v1 lazily when the first syncpoint is flushed. + // Treat a missing table as "no candidate yet" so early periodic checks keep waiting. + return 0, 0, false, nil + } + return 0, 0, false, err + } + defer rows.Close() + + for rows.Next() { + var p, s uint64 + if err := rows.Scan(&p, &s); err != nil { + return 0, 0, false, err + } + return p, s, true, nil + } + if err := rows.Err(); err != nil { + return 0, 0, false, err + } + return 0, 0, false, nil +} + +func isNoSuchTableError(err error) bool { + var mysqlErr *mysql.MySQLError + return errors.As(err, &mysqlErr) && mysqlErr.Number == mysqlErrNoSuchTable +} + +func fetchDDLWindows(ctx context.Context, host string, port int) ([]ddlWindow, error) { + // TiDB exposes recent DDL jobs via /ddl/history. We treat the job runtime as a window + // where snapshot reads may be inconsistent across schema versions. 
+ u := fmt.Sprintf("http://%s:%d/ddl/history", host, port) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, fmt.Errorf("ddl history http %d: %s", resp.StatusCode, string(b)) + } + + var v any + if err := json.NewDecoder(resp.Body).Decode(&v); err != nil { + return nil, err + } + var windows []ddlWindow + extractDDLWindows(v, &windows) + + sort.Slice(windows, func(i, j int) bool { return windows[i].start < windows[j].start }) + return windows, nil +} + +func extractDDLWindows(v any, out *[]ddlWindow) { + switch x := v.(type) { + case map[string]any: + start := parseUint64(x["real_start_ts"]) + end := parseUint64(x["FinishedTS"]) + if start != 0 { + // For running DDL jobs, FinishedTS may be 0. Treat it as an open-ended window so + // we can conservatively skip syncpoints that may observe inconsistent snapshots. + if end == 0 { + end = ^uint64(0) + } + *out = append(*out, ddlWindow{start: start, end: end}) + } + for _, vv := range x { + extractDDLWindows(vv, out) + } + case []any: + for _, vv := range x { + extractDDLWindows(vv, out) + } + } +} + +func inDDLWindow(ts uint64, windows []ddlWindow) bool { + for _, w := range windows { + if ts > w.start && ts < w.end { + return true + } + } + return false +} + +func (r *runner) writeSyncpointDiffConfig(path string, model *clusterModel, primary, secondary uint64) error { + // Only diff stable domain tables. 
+ var stable []string + for _, t := range model.stableTables { + stable = append(stable, fmt.Sprintf("%s.%s", t.db, t.name)) + } + sort.Strings(stable) + + var b strings.Builder + b.WriteString("# diff Configuration.\n\n") + b.WriteString("check-thread-count = 4\n\n") + b.WriteString("export-fix-sql = true\n\n") + b.WriteString("check-struct-only = false\n\n") + b.WriteString("[task]\n") + b.WriteString(fmt.Sprintf(" output-dir = %q\n\n", filepath.Join(r.cfg.Workdir, "sync_diff", fmt.Sprintf("syncpoint_%d", primary), "output"))) + b.WriteString(" source-instances = [\"upstream\"]\n\n") + b.WriteString(" target-instance = \"downstream\"\n\n") + b.WriteString(" target-check-tables = [\n") + for i, t := range stable { + sep := "," + if i == len(stable)-1 { + sep = "" + } + b.WriteString(fmt.Sprintf(" %q%s\n", t, sep)) + } + b.WriteString(" ]\n\n") + b.WriteString("[data-sources]\n") + b.WriteString("[data-sources.upstream]\n") + b.WriteString(fmt.Sprintf(" host = %q\n", r.cfg.Upstream.Host)) + b.WriteString(fmt.Sprintf(" port = %d\n", r.cfg.Upstream.Port)) + b.WriteString(fmt.Sprintf(" user = %q\n", r.cfg.Upstream.User)) + b.WriteString(fmt.Sprintf(" password = %q\n", r.cfg.Upstream.Password)) + b.WriteString(fmt.Sprintf(" snapshot = %q\n\n", fmt.Sprintf("%d", primary))) + + b.WriteString("[data-sources.downstream]\n") + b.WriteString(fmt.Sprintf(" host = %q\n", r.cfg.Downstream.Host)) + b.WriteString(fmt.Sprintf(" port = %d\n", r.cfg.Downstream.Port)) + b.WriteString(fmt.Sprintf(" user = %q\n", r.cfg.Downstream.User)) + b.WriteString(fmt.Sprintf(" password = %q\n", r.cfg.Downstream.Password)) + b.WriteString(fmt.Sprintf(" snapshot = %q\n", fmt.Sprintf("%d", secondary))) + + return os.WriteFile(path, []byte(b.String()), 0o644) +} + +type tailBuffer struct { + buf []byte + max int +} + +func newTailBuffer(maxBytes int) *tailBuffer { + return &tailBuffer{max: maxBytes} +} + +func (t *tailBuffer) Write(p []byte) (int, error) { + if t == nil || t.max <= 0 { + return 
len(p), nil + } + if len(p) >= t.max { + t.buf = append(t.buf[:0], p[len(p)-t.max:]...) + return len(p), nil + } + if len(t.buf)+len(p) <= t.max { + t.buf = append(t.buf, p...) + return len(p), nil + } + overflow := len(t.buf) + len(p) - t.max + t.buf = append(t.buf[overflow:], p...) + return len(p), nil +} + +func (t *tailBuffer) String() string { + if t == nil { + return "" + } + return string(t.buf) +} + +func isSkippableSyncDiffFailure(outputTail string) bool { + // sync_diff_inspector runs snapshot reads and may fail with schema-related errors when a syncpoint + // is observed during (or near) a DDL window. Treat those cases as "invalid syncpoint" and skip. + s := strings.ToLower(outputTail) + switch { + case strings.Contains(s, "unknown column"): + return true + case strings.Contains(s, "no table need to be compared"): + return true + default: + return false + } +} + +const ( + tidbEnableExternalTSReadVar = "tidb_enable_external_ts_read" + externalTSReadOffParam = tidbEnableExternalTSReadVar + "=OFF" +) + +func (r *runner) runSyncDiffInspectorWithSnapshotGuard(ctx context.Context, confPath, logPath string, retries int) (string, error) { + // sync_diff_inspector should compare only the snapshot pair from the config. + // Keep downstream external-ts reads disabled during the diff so any connection + // that misses its configured snapshot cannot fall back to a later syncpoint. 
+ downstream, err := openMySQLWithExtraParams(ctx, r.cfg.Downstream, externalTSReadOffParam) + if err != nil { + return "", err + } + defer func() { + _ = downstream.Close() + }() + + original, err := queryGlobalExternalTSRead(ctx, downstream) + if err != nil { + return "", err + } + if err := setGlobalExternalTSRead(ctx, downstream, "OFF"); err != nil { + return "", err + } + + diag, runErr := runSyncDiffInspector(ctx, confPath, logPath, retries) + if original == "OFF" { + return diag, runErr + } + + restoreCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 10*time.Second) + restoreErr := setGlobalExternalTSRead(restoreCtx, downstream, original) + cancel() + if restoreErr != nil { + r.logger.Printf("syncpoint diff: failed to restore %s=%s: err=%v", tidbEnableExternalTSReadVar, original, restoreErr) + if runErr == nil { + return diag, restoreErr + } + } + return diag, runErr +} + +func queryGlobalExternalTSRead(ctx context.Context, downstream *sql.DB) (string, error) { + var value string + if err := downstream.QueryRowContext(ctx, "SELECT @@global."+tidbEnableExternalTSReadVar).Scan(&value); err != nil { + return "", err + } + return normalizeExternalTSReadValue(value) +} + +func setGlobalExternalTSRead(ctx context.Context, downstream *sql.DB, value string) error { + normalized, err := normalizeExternalTSReadValue(value) + if err != nil { + return err + } + _, err = downstream.ExecContext(ctx, "SET GLOBAL "+tidbEnableExternalTSReadVar+" = "+normalized) + return err +} + +func normalizeExternalTSReadValue(value string) (string, error) { + switch strings.ToUpper(strings.TrimSpace(value)) { + case "ON", "1", "TRUE": + return "ON", nil + case "OFF", "0", "FALSE": + return "OFF", nil + default: + return "", fmt.Errorf("unexpected %s value: %q", tidbEnableExternalTSReadVar, value) + } +} + +func runSyncDiffInspector(ctx context.Context, confPath, logPath string, retries int) (string, error) { + // sync_diff_inspector output can be large. 
Keep a tail buffer for diagnostics while + // still appending full logs to a file in the workdir. + if retries < 1 { + retries = 1 + } + + f, err := os.OpenFile(logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return "", err + } + defer f.Close() + + var lastTail string + for i := 0; i < retries; i++ { + tail := newTailBuffer(64 * 1024) + w := io.MultiWriter(f, tail) + cmd := exec.CommandContext(ctx, "sync_diff_inspector", "--log-level=debug", "--config="+confPath) + cmd.Stdout = w + cmd.Stderr = w + err = cmd.Run() + if err == nil { + return "", nil + } + lastTail = tail.String() + select { + case <-ctx.Done(): + return lastTail, ctx.Err() + case <-time.After(2 * time.Second): + } + } + return lastTail, err +} diff --git a/tests/utils/random_ddl_test_runner/workload.go b/tests/utils/random_ddl_test_runner/workload.go new file mode 100644 index 0000000000..607276627a --- /dev/null +++ b/tests/utils/random_ddl_test_runner/workload.go @@ -0,0 +1,465 @@ +package main + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "sync/atomic" + "time" +) + +func (r *runner) workload() error { + // workload runs a bounded-duration stress workload against upstream only, while verifying: + // - the changefeed remains in "normal" state, + // - checkpoint continues advancing (with auto-tuning on stalls), + // - optional snapshot diffs at TiCDC syncpoints (MySQL sink), + // - optional capture failover recovery (multi-capture failover case). + // + // After the time budget is consumed, the runner inserts a finish mark row on upstream and + // waits for it to appear on downstream as a "catch-up" barrier, then writes a final + // diff_config.toml for sync_diff_inspector. 
+ ctx, cancel := context.WithTimeout(context.Background(), r.cfg.Duration.Duration) + defer cancel() + + if r.cfg.CDC.ChangefeedID == "" { + return fmt.Errorf("cdc.changefeed_id is required for workload phase") + } + + if err := os.MkdirAll(r.cfg.Workdir, 0o755); err != nil { + return err + } + + cfgSnapshot, _ := json.MarshalIndent(r.cfg, "", " ") + _ = os.WriteFile(filepath.Join(r.cfg.Workdir, "runner_config.snapshot.json"), cfgSnapshot, 0o644) + + r.logger.Printf("workload start: duration=%s seed=%d changefeed=%s sink=%s", + r.cfg.Duration.Duration, r.cfg.Seed, r.cfg.CDC.ChangefeedID, r.cfg.SinkType) + + up, err := openMySQL(ctx, r.cfg.Upstream) + if err != nil { + return err + } + defer func() { _ = up.Close() }() + + down, err := openMySQL(ctx, r.cfg.Downstream) + if err != nil { + return err + } + defer func() { _ = down.Close() }() + + model := buildInitialModel(r.cfg) + + // Initialize frozen rows for the motif family (t03). + for _, t := range model.tables { + if !t.isMotif { + continue + } + t.mu.Lock() + for i := int64(1); i <= int64(r.cfg.Bootstrap.FrozenRowsPerTable); i++ { + t.frozen[i] = struct{}{} + } + t.mu.Unlock() + } + + trace, err := newDDLTrace(r.cfg.Workdir) + if err != nil { + return err + } + defer trace.close() + + var ( + activeDMLWorkers int32 = int32(r.cfg.DML.InitialWorkers) + activeDDLWorkers int32 = int32(r.cfg.DDL.InitialWorkers) + motifStep int32 = 0 + syncpointChecked int32 = 0 + ) + + dmlCounters := &dmlCounters{} + ddlSelector := newDDLSelector(defaultDDLKinds(), 200) + + errCh := make(chan error, 1) + var wg sync.WaitGroup + + // DML workers. + for i := 0; i < r.cfg.DML.MaxWorkers; i++ { + wg.Add(1) + go func(workerID int) { + defer wg.Done() + dmlWorker(ctx, up, model, r.cfg.Seed+10_000, workerID, &activeDMLWorkers, r.cfg.DML, dmlCounters, &motifStep) + }(i) + } + + // DDL workers. 
+ for i := 0; i < r.cfg.DDL.MaxWorkers; i++ { + wg.Add(1) + go func(workerID int) { + defer wg.Done() + ddlWorker(ctx, up, model, r.cfg.Seed+20_000, workerID, &activeDDLWorkers, ddlSelector, trace, r.logger) + }(i) + } + + // Primary motif controller. + wg.Add(1) + go func() { + defer wg.Done() + runPrimaryMotif(ctx, up, model, &motifStep, trace, r.logger, r.cfg.Profile) + }() + + // Health monitor + auto-tune. + wg.Add(1) + go func() { + defer wg.Done() + if err := r.healthAndAutotuneLoop(ctx, dmlCounters, &activeDMLWorkers, &activeDDLWorkers); err != nil { + select { + case errCh <- err: + default: + } + cancel() + } + }() + + // Big transactions to stress large commit paths (optional, enabled by default). + if r.cfg.DML.BigTxnEnabled { + wg.Add(1) + go func() { + defer wg.Done() + bigTxnWorker(ctx, up, model, r.cfg.Seed+40_000, r.cfg.DML, &activeDMLWorkers) + }() + } + + // Key conflict writer (optional, enabled by default). + if r.cfg.DML.KeyConflictEnabled { + wg.Add(1) + go func() { + defer wg.Done() + conflictWriter(ctx, up, model, r.cfg.Seed+50_000, r.cfg.DML, dmlCounters) + }() + } + + // MySQL syncpoint diff controller (optional). + if r.cfg.SinkType == "mysql" && r.cfg.MySQL.Enabled { + wg.Add(1) + go func() { + defer wg.Done() + if err := r.syncpointDiffLoop(ctx, up, down, model, trace, &syncpointChecked); err != nil { + select { + case errCh <- err: + default: + } + cancel() + } + }() + } + + // Failover controller (optional). 
+ if r.cfg.Failover.Enabled && len(r.cfg.Failover.CaptureAddrs) > 0 { + wg.Add(1) + go func() { + defer wg.Done() + if err := r.failoverLoop(ctx, &motifStep, trace); err != nil { + select { + case errCh <- err: + default: + } + cancel() + } + }() + } + + <-ctx.Done() + wg.Wait() + + select { + case err := <-errCh: + return err + default: + } + + r.logger.Printf("workload finished, waiting for converge: %s", r.cfg.Verify.ConvergeWait.Duration) + time.Sleep(r.cfg.Verify.ConvergeWait.Duration) + + convergeCtx, convergeCancel := context.WithTimeout(context.Background(), r.cfg.Verify.ConvergeTimeout.Duration) + defer convergeCancel() + + if err := r.createAndWaitFinishMark(convergeCtx, up, down, model); err != nil { + return err + } + + if r.cfg.SinkType == "mysql" && r.cfg.MySQL.Enabled && r.cfg.MySQL.MaxDiffChecks > 0 { + need := r.cfg.MySQL.MaxDiffChecks - int(atomic.LoadInt32(&syncpointChecked)) + if need > 0 { + r.logger.Printf("syncpoint diff: catching up after workload, need=%d", need) + diffCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + if err := r.ensureSyncpointDiffAfterWorkload(diffCtx, down, model, need); err != nil { + return err + } + } + } + + if err := r.writeDMLStats(dmlCounters); err != nil { + return err + } + + // The workload context is already done here. Use a short-lived context for final verification steps. 
+ diffCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := r.writeFinalDiffConfig(diffCtx, up, model); err != nil { + return err + } + + if r.cfg.Verify.LogScanEnabled { + if err := scanLogsForPatterns(r.cfg.Workdir, r.cfg.Verify.PanicPatterns, r.cfg.Verify.FailOnPanicMatch, r.logger); err != nil { + return err + } + } + + return nil +} + +func (r *runner) healthAndAutotuneLoop( + ctx context.Context, + dmlCounters *dmlCounters, + activeDMLWorkers *int32, + activeDDLWorkers *int32, +) error { + // This loop is the guardrail for the stress test: + // - It continuously checks changefeed state and checkpoint progress. + // - It degrades worker concurrency on stalls or low DML success rate to help recovery. + // - It fails the run if checkpoint does not advance for NoAdvanceHard. + ticker := time.NewTicker(r.cfg.Verify.HealthInterval.Duration) + defer ticker.Stop() + + var ( + lastCheckpoint uint64 + lastAdvance = time.Now() + prevSnap = dmlCounters.snapshot() + ) + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + } + + st, err := r.getChangefeedStatus(ctx) + if err != nil { + return err + } + if st.State != "normal" { + return fmt.Errorf("changefeed state is not normal: %s", st.State) + } + now := time.Now() + if lastCheckpoint == 0 { + lastCheckpoint = st.Checkpoint + lastAdvance = now + } else if st.Checkpoint != 0 && st.Checkpoint != lastCheckpoint { + lastCheckpoint = st.Checkpoint + lastAdvance = now + } + + sinceAdvance := now.Sub(lastAdvance) + snap := dmlCounters.snapshot() + intervalTotal := snap.Total - prevSnap.Total + intervalSuccess := snap.Success - prevSnap.Success + prevSnap = snap + + successRate := 1.0 + if intervalTotal > 0 { + successRate = float64(intervalSuccess) / float64(intervalTotal) + } + + r.logger.Printf("health: state=%s checkpoint=%d since_advance=%s dml_total=%d dml_success=%d success_rate=%.3f active_dml=%d active_ddl=%d", + st.State, st.Checkpoint, sinceAdvance, 
intervalTotal, intervalSuccess, successRate, + atomic.LoadInt32(activeDMLWorkers), atomic.LoadInt32(activeDDLWorkers)) + + res := autoTuneStep( + sinceAdvance, + successRate, + atomic.LoadInt32(activeDMLWorkers), + atomic.LoadInt32(activeDDLWorkers), + int32(r.cfg.DML.MaxWorkers), + int32(r.cfg.DDL.MaxWorkers), + r.cfg.Verify.NoAdvanceSoft.Duration, + r.cfg.Verify.NoAdvanceHard.Duration, + ) + if res.fail { + return fmt.Errorf("checkpoint did not advance for %s (hard=%s)", sinceAdvance, r.cfg.Verify.NoAdvanceHard.Duration) + } + atomic.StoreInt32(activeDMLWorkers, res.nextDML) + atomic.StoreInt32(activeDDLWorkers, res.nextDDL) + } +} + +func (r *runner) writeDMLStats(counters *dmlCounters) error { + snap := counters.snapshot() + b, err := json.MarshalIndent(snap, "", " ") + if err != nil { + return err + } + return writeFileAtomic(filepath.Join(r.cfg.Workdir, "dml_stats.json"), b) +} + +func (r *runner) writeFinalDiffConfig(ctx context.Context, up *sql.DB, model *clusterModel) error { + // Write a TOML-like config by template to avoid adding new dependencies. + // + // Use actual upstream table existence to build the final diff table list. + // This avoids sync_diff_inspector init failures when churn-domain tables are dropped, + // recovered to a different name, or partially modified due to concurrent DDL. + tables, err := listExistingBaseTables(ctx, up, model.dbs) + if err != nil { + // Fall back to model state when upstream introspection fails. + // This should be rare and keeps the runner resilient to transient DB issues. 
+ r.logger.Printf("final diff: failed to list upstream tables, falling back to model state: err=%v", err) + for _, t := range model.tables { + t.mu.Lock() + exists := t.exists + dbName := t.db + tableName := t.name + t.mu.Unlock() + if !exists { + continue + } + tables = append(tables, fmt.Sprintf("%s.%s", dbName, tableName)) + } + sort.Strings(tables) + } + + var b strings.Builder + b.WriteString("# diff Configuration.\n\n") + b.WriteString("check-thread-count = 4\n\n") + b.WriteString("export-fix-sql = true\n\n") + b.WriteString("check-struct-only = false\n\n") + b.WriteString("[task]\n") + b.WriteString(fmt.Sprintf(" output-dir = %q\n\n", filepath.Join(r.cfg.Workdir, "sync_diff", "output"))) + b.WriteString(" source-instances = [\"upstream\"]\n\n") + b.WriteString(" target-instance = \"downstream\"\n\n") + b.WriteString(" target-check-tables = [\n") + for i, t := range tables { + sep := "," + if i == len(tables)-1 { + sep = "" + } + b.WriteString(fmt.Sprintf(" %q%s\n", t, sep)) + } + b.WriteString(" ]\n\n") + b.WriteString("[data-sources]\n") + b.WriteString("[data-sources.upstream]\n") + b.WriteString(fmt.Sprintf(" host = %q\n", r.cfg.Upstream.Host)) + b.WriteString(fmt.Sprintf(" port = %d\n", r.cfg.Upstream.Port)) + b.WriteString(fmt.Sprintf(" user = %q\n", r.cfg.Upstream.User)) + b.WriteString(fmt.Sprintf(" password = %q\n\n", r.cfg.Upstream.Password)) + + b.WriteString("[data-sources.downstream]\n") + b.WriteString(fmt.Sprintf(" host = %q\n", r.cfg.Downstream.Host)) + b.WriteString(fmt.Sprintf(" port = %d\n", r.cfg.Downstream.Port)) + b.WriteString(fmt.Sprintf(" user = %q\n", r.cfg.Downstream.User)) + b.WriteString(fmt.Sprintf(" password = %q\n", r.cfg.Downstream.Password)) + + return os.WriteFile(filepath.Join(r.cfg.Workdir, "diff_config.toml"), []byte(b.String()), 0o644) +} + +func listExistingBaseTables(ctx context.Context, db *sql.DB, dbs []string) ([]string, error) { + var tables []string + for _, dbName := range dbs { + // dbName is generated by the 
runner and should be safe to embed in a quoted identifier. + q := fmt.Sprintf("SHOW FULL TABLES IN `%s` WHERE Table_Type = 'BASE TABLE';", dbName) + rows, err := db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + for rows.Next() { + var tblName string + var tblType string + if err := rows.Scan(&tblName, &tblType); err != nil { + _ = rows.Close() + return nil, err + } + tables = append(tables, fmt.Sprintf("%s.%s", dbName, tblName)) + } + if err := rows.Err(); err != nil { + _ = rows.Close() + return nil, err + } + _ = rows.Close() + } + sort.Strings(tables) + return tables, nil +} + +func (r *runner) createAndWaitFinishMark(ctx context.Context, up, down *sql.DB, model *clusterModel) error { + // The finish mark is a replication barrier: the workload is already stopped, but the + // sink / consumer may still be draining. Waiting for the finish mark to appear on + // downstream provides a deterministic "catch up" point before running the final diff. + if len(model.dbs) == 0 { + return fmt.Errorf("no databases in model") + } + + markerDB := model.dbs[0] + const markerTable = "finish_mark" + const markerID int64 = 1 + markerValue := r.cfg.Seed + + r.logger.Printf("converge: creating finish mark table: db=%s table=%s id=%d value=%d", + markerDB, markerTable, markerID, markerValue) + + createSQL := fmt.Sprintf("CREATE TABLE IF NOT EXISTS `%s`.`%s` (`id` BIGINT PRIMARY KEY, `v` BIGINT NOT NULL)", + markerDB, markerTable) + if _, err := up.ExecContext(ctx, createSQL); err != nil { + return err + } + insertSQL := fmt.Sprintf("REPLACE INTO `%s`.`%s` (`id`, `v`) VALUES (?, ?)", markerDB, markerTable) + if _, err := up.ExecContext(ctx, insertSQL, markerID, markerValue); err != nil { + return err + } + + r.logger.Printf("converge: waiting for finish mark to appear in downstream") + + pollTicker := time.NewTicker(2 * time.Second) + defer pollTicker.Stop() + healthTicker := time.NewTicker(r.cfg.Verify.HealthInterval.Duration) + defer healthTicker.Stop() + + for { + 
select { + case <-ctx.Done(): + return context.Cause(ctx) + case <-healthTicker.C: + st, err := r.getChangefeedStatus(ctx) + if err != nil { + return err + } + if st.State != "normal" { + return fmt.Errorf("changefeed state is not normal: %s", st.State) + } + r.logger.Printf("converge: waiting for finish mark, checkpoint=%d", st.Checkpoint) + case <-pollTicker.C: + var got int64 + q := fmt.Sprintf("SELECT `v` FROM `%s`.`%s` WHERE `id` = ?", markerDB, markerTable) + err := down.QueryRowContext(ctx, q, markerID).Scan(&got) + if err == nil { + if got == markerValue { + r.logger.Printf("converge done: finish mark applied downstream") + return nil + } + r.logger.Printf("converge: finish mark row value mismatch: got=%d want=%d", got, markerValue) + continue + } + if err == sql.ErrNoRows { + continue + } + // Table may not exist yet on MQ sinks. + if strings.Contains(err.Error(), "doesn't exist") { + continue + } + return err + } + } +} From fbc7dfc0ad63fdc681efa8c51afef79b1ee2f84a Mon Sep 17 00:00:00 2001 From: dongmen <414110582@qq.com> Date: Tue, 16 Jun 2026 15:39:30 +0800 Subject: [PATCH 2/3] remove redundant files Signed-off-by: dongmen <414110582@qq.com> --- .issue/weekly_rand_single_failure_analysis.md | 747 ------------------ .issue/weekly_rand_single_notebook.md | 345 -------- 2 files changed, 1092 deletions(-) delete mode 100644 .issue/weekly_rand_single_failure_analysis.md delete mode 100644 .issue/weekly_rand_single_notebook.md diff --git a/.issue/weekly_rand_single_failure_analysis.md b/.issue/weekly_rand_single_failure_analysis.md deleted file mode 100644 index ef73317889..0000000000 --- a/.issue/weekly_rand_single_failure_analysis.md +++ /dev/null @@ -1,747 +0,0 @@ -# weekly_rand_single 失败分析与修复计划 - -## 目标 - -修复 `/tmp/tidb_cdc_test/weekly_rand_single` case 在收敛阶段超时的问题,并用回归测试和连续 5 次 case 通过确认修复有效。 - -当前修复分支:`fix-weekly-rand-single-ddl-progress`,基于 `0115-ddl-test` 的 `0ffe03f83`。 - -## 现象 - -失败不是 sync-diff 数据不一致,而是 workload 结束后等待 finish mark 同步到下游超时。 - -关键日志: - 
-- `runner.log:2944`:`syncpoint diff` 成功,`primary_ts=467013528453120000`。 -- `runner.log:5122`:workload finished,开始等待收敛。 -- `runner.log:5123-5124`:上游创建 `db1.finish_mark` 并等待下游出现。 -- `runner.log:5205-5303`:checkpoint 卡在 `467013630689280000`。 -- `runner.log:5304`:`runner failed: context deadline exceeded`。 - -TSO 换算(Asia/Shanghai): - -- `467013630689280000` = `2026-06-15 17:27:00.000 +08:00`。 -- `467013636614783103` = `2026-06-15 17:27:22.604 +08:00`。 -- `467013638553600000` = `2026-06-15 17:27:30.000 +08:00`。 -- `467013645501726800` = `2026-06-15 17:27:56.505 +08:00`。 -- `467013645501726844` = `2026-06-15 17:27:56.505 +08:00`。 - -## 已确认的卡点 - -CDC 日志显示 barrier 没有完成 coverage,导致 maintainer 不能把 global checkpoint 推过早期 DDL/syncpoint barrier: - -- DDL barrier:`commitTs=467013636614783103`,query 为 `CREATE INDEX idx_d_6619 ON db1.t08(d)`。 -- 该 DDL 覆盖物理分区表 `142, 143, 144, 145`。 -- `142/143/144` 都有 `dispatcher receive ddl event` 和 ack。 -- `145` 没有对应的 accepted DDL 日志,maintainer 报 `reported count: 3, require count: 4, uncovered tables: 145`。 -- syncpoint barrier:`commitTs=467013638553600000`。 -- 后续 syncpoint 又显示 `uncovered tables: 142, 143, 144, 145`,本质上是前面的 DDL barrier 没过,导致后面的 barrier 继续被挡住。 -- `maintainer.go` 反复选择 `newCheckpointTs=467013630689279999`,runner 侧看到的 checkpoint 为 `467013630689280000`。 - -同时,event scanner 存在一个明确的无进展循环: - -- `tableID=2669` 对应旧物理表 `db3.t10_r_7179892`。 -- TiDB DDL 在 `2026-06-15 17:27:56 +08:00` 对该表执行 `TRUNCATE TABLE db3.t10_r_7179892`。 -- schema store 的删除版本为 `deleteVersion=467013645501726800`。 -- scanner 反复请求 `GetTableInfo(tableID=2669, ts=467013645501726844)`。 -- 因为 `ts >= deleteVersion`,`multi_version.go` 返回 `TableDeletedError`。 -- `event_scanner.go` 把该错误转换成 `nil, nil`,随后以 `rawEvent.CRTs-1` 调用 `finalizeScan`。 - -问题在于这次原始事件的 `rawEvent.CRTs-1` 正好等于当前扫描起点;scanner 发送的 resolved event 没有推进水位。下一轮仍然从同一个起点扫描到同一条 raw event,然后再次遇到 `TableDeletedError`,形成死循环。 - -相关代码: - -- `logservice/schemastore/multi_version.go`:`getTableInfo` 在 `ts >= deleteVersion` 时返回 
`TableDeletedError`。 -- `pkg/eventservice/event_scanner.go`:`scanAndMergeEvents` 在 `tableInfo == nil` 时调用 `finalizeScan(..., rawEvent.CRTs-1)`。 -- `logservice/eventstore/event_store.go`:iterator 扫描范围是 `(CommitTsStart, CommitTsEnd]`,所以 resolved ts 必须严格大于旧的 `CommitTsStart` 才能跳过当前 raw event。 - -table 145 的 DDL 缺失与 reset/replay 状态有关: - -- `cdc-2026-06-15T17-51-16.335.log:726193`:dispatcher `1774578769225496409714574658290184438691` 收到 reset,`epoch=7`,`resetTs=467013575639039999`。 -- 同一 DDL 的旧 epoch 事件随后被正确忽略:`eventEpoch=6`,`dispatcherEpoch=7`。 -- `cdc-2026-06-15T17-52-19.370.log:121963`:event service 已经向 table 145 dispatcher 重发 DDL,`commitTs=467013636614783103`,`seq=1325`。 -- 但 downstream dispatcher 没有记录 `dispatcher receive ddl event`,说明事件在进入 dispatcher 前被 eventcollector 的状态过滤或 reset 后状态不一致挡住。 -- `downstreamadapter/eventcollector/dispatcher_stat.go` 原来在 `advanceEpochForReset` 只切换 epoch/maxEventTs,没有把 `lastEventCommitTs` 和同 ts DDL/SyncPoint 去重标志回到 resetTs。 -- reset 的语义是从 `resetTs` 重新 replay 新 epoch 事件;如果旧 epoch 已经把 `lastEventCommitTs` 推到更大值,新 epoch 中位于 `(resetTs, oldLastEventCommitTs)` 的 replay DDL 会被当成旧事件过滤,table 145 就不会向 maintainer 上报 DDL barrier。 - -## 根因判断 - -目前确认有三个会阻塞 checkpoint 前进的问题: - -1. 直接卡住原始 case 的问题是 eventcollector reset 后没有同步重置 commitTs 去重状态。table 145 reset 到新 epoch 后需要 replay `467013636614783103` 的 DDL,但旧 epoch 的 `lastEventCommitTs` 可能已经更大,导致 replay DDL 在进入 downstream dispatcher 前被过滤。maintainer 因此一直等不到 table 145 的 DDL barrier report。 - -2. event scanner 还有一个独立的无进展问题:deleted table raw event 被跳过时 resolved ts 仍可能停在 `rawEvent.CRTs-1`,而该值等于当前 scan start 时,下一轮会再次读到同一条 raw event。 - -对于已经删除或 truncate 后的旧物理表,遇到无法取到 table info 的 raw event 时,scanner 不能继续把 resolved ts 固定在 `rawEvent.CRTs-1`。如果该值等于本轮 `CommitTsStart`,event broker 下一轮仍会在 `(CommitTsStart, CommitTsEnd]` 内看到同一条 event,导致 dispatcher 无法完成对应 DDL/syncpoint coverage,最终 changefeed checkpoint 卡住,finish mark 永远不能同步到下游。 - -3. 
第一次带前两个修复重新跑 case 后,原 table 145 DDL 卡点没有复现,但暴露出同一 syncpoint barrier 被迟到 WAITING 状态重建后的覆盖丢失问题。第一次 `commitTs=467018333552640000` 的 syncpoint barrier 已经完成并从 `blockedEvents` 删除;随后迟到 WAITING 又创建了第二个同 ts syncpoint barrier。第二个 barrier 通过 checkpoint-forward 直接进入 selected/pass 阶段,但新的 range checker 没有继承第一次 barrier 中已经 DONE 的 span block state,导致 table 299 永久显示 uncovered,checkpoint 卡住超过 5 分钟。 - -这个问题不能通过简单忽略迟到 WAITING 解决,因为迟到或重启后的 dispatcher 仍可能需要收到 Pass 才能解除本地 block。正确处理方式是:当 barrier 因 checkpoint-forward 进入 selected 阶段时,和正常 writer 选择路径一样重置 DONE 阶段进度,并把当前 spanController 中已严格越过 barrier、或同一 `(commitTs, isSyncPoint)` 已经上报 DONE 的 replication 计入新的 range checker。 - -## 修复计划 - -1. 在 `pkg/eventservice/event_scanner.go` 中修改 deleted table 分支。 - - 当前行为:`finalizeScan(..., rawEvent.CRTs-1)`。 - - 目标行为:对 `TableDeletedError` 造成的 `tableInfo == nil`,跳过当前 raw event 所在 commit ts,并用 `rawEvent.CRTs` 作为 resolved ts。 - - 这样下一轮 iterator 的 `(CommitTsStart, CommitTsEnd]` 不会再包含当前 raw event。 - -2. 在 `downstreamadapter/eventcollector/dispatcher_stat.go` 中修改 reset 状态。 - - 当前行为:`advanceEpochForReset` 只切换 epoch 和 `maxEventTs`。 - - 目标行为:成功进入新 epoch 时,把 `lastEventCommitTs` 重置为 `resetTs`,并清掉 `gotDDLOnTs` / `gotSyncpointOnTS`。 - - 这样 reset replay 的 DDL/SyncPoint 不会被旧 epoch 的 commitTs 去重状态过滤。 - -3. 增加聚焦回归测试。 - - 文件:`pkg/eventservice/event_scanner_test.go`。 - - 场景:mock schema store 返回 `TableDeletedError`,scan range 的 `CommitTsStart` 设置为 `rawEvent.CRTs-1`。 - - 断言:scanner 不产生 DML,返回 resolved event,并且 resolved ts 等于 `rawEvent.CRTs` 且严格大于 scan start。 - - 文件:`downstreamadapter/eventcollector/dispatcher_stat_test.go`。 - - 场景:旧 epoch 的 `lastEventCommitTs=220`,reset 到 `150`,新 epoch handshake 后 replay `180` 的 DDL。 - - 断言:reset 后 commitTs 状态回到 `150`,并且 `180` 的 DDL 可以被转发到 dispatcher。 - -4. 
在 `maintainer/barrier_event.go` 中修复 checkpoint-forward selected 路径。 - - 当前行为:`checkBlockedDispatchers` 发现某个 replication 已越过 barrier 后,只设置 `selected=true` 和 `writerDispatcherAdvanced=true`。 - - 目标行为:进入 selected 阶段时统一调用重置逻辑,重建或清空 range checker,并把已越过当前 barrier 的 replication 加入 DONE 阶段覆盖。 - - `forwardBarrierEvent` 保持 `checkpointTs > commitTs` 的严格判断,同时新增同一 `(commitTs, isSyncPoint)` 且 `Stage_DONE` 的判断,避免把 `checkpointTs == commitTs` 误认为 syncpoint 已经 flush。 - -5. 增加 maintainer 回归测试。 - - 文件:`maintainer/barrier_test.go`。 - - 场景:同一 syncpoint barrier 先正常完成并删除;迟到 WAITING 重建同 ts barrier;一个 dispatcher checkpoint-forward 触发 selected。 - - 断言:旧 barrier 中已 DONE 的 dispatcher 会计入新 range checker,重建的 barrier 可以立即完成,不会留下永久 uncovered table。 - -6. 本地验证顺序。 - - 先跑 `go test ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1`。 - - 再跑 `go test ./downstreamadapter/eventcollector -run TestAdvanceEpochForResetClearsCommitTsFilter -count=1`。 - - 再跑 `go test ./maintainer -run TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers -count=1`。 - - 再分别跑 `go test ./pkg/eventservice -count=1`、`go test ./downstreamadapter/eventcollector -count=1`、`go test ./maintainer -count=1`。 - - 如编译或测试失败,先修复单测/逻辑再扩大验证。 - -7. case 验证。 - - 重新构建需要的 `cdc` binary。 - - 运行 `weekly_rand_single` case。 - - 连续通过 5 次才认为成功。 - -8. 
独立审查。 - - 修复完成后让 reviewer subagent 检查 diff、回归测试和剩余风险。 - - 若发现阻塞问题,修复后重新跑相关测试。 - -## 调查记录 - -- 已读取 TiCDC event-broker / schema-store 相关代码路径。 -- 已确认 `runner.log` 的失败点为收敛超时,不是 sync-diff mismatch。 -- 已确认 event store 迭代器扫描范围是 `(CommitTsStart, CommitTsEnd]`。 -- 已确认 `TableDeletedError` 目前只在 `getTableInfo4Txn` 单元测试里覆盖,没有覆盖 scanner 水位是否前进。 - -## 已实施修复 - -- `pkg/eventservice/event_scanner.go` - - deleted-table 分支从 `finalizeScan(..., rawEvent.CRTs-1)` 改为 `finalizeScan(..., rawEvent.CRTs)`。 - - 目的:让当前无法解码的 post-delete raw event 被排除在下一轮 `(CommitTsStart, CommitTsEnd]` 之外。 - -- `pkg/eventservice/event_scanner_test.go` - - 更新 `TestEventScannerWithDeleteTable` 的预期:删除后的第一条 raw event 被跳过后,resolved ts 前进到该 raw event 的 `CRTs`。 - - 新增 `TestScanAndMergeEventsSkipsDeletedTableTxn`,直接覆盖 `TableDeletedError` 且 `CommitTsStart == rawEvent.CRTs-1` 的无进展场景。 - -- `downstreamadapter/eventcollector/dispatcher_stat.go` - - `advanceEpochForReset` 成功切换到新 epoch 后,把 `lastEventCommitTs` 重置为 `resetTs`。 - - 同时清理 `gotDDLOnTs` 和 `gotSyncpointOnTS`。 - - 目的:reset 后从 `resetTs` replay,新 epoch 的 DDL/SyncPoint 不能被旧 epoch 的 commitTs 去重状态过滤。 - -- `maintainer/barrier_event.go` - - 新增 selected 阶段重置逻辑:确保 range checker 存在,重置 DONE 阶段 coverage,并把已越过当前 barrier 的 replication 计入 coverage。 - - `checkBlockedDispatchers` 的 checkpoint-forward 路径不再只设置 `selected/writerDispatcherAdvanced`,而是走同一 selected 阶段初始化逻辑。 - - `forwardBarrierEvent` 新增同一 `(BlockTs, IsSyncPoint)` 且 `Stage_DONE` 的判断。 - - 保留 `checkpointTs > commitTs` 的严格条件,不使用 `>=`,避免 dispatcher 以 `startTs == commitTs` 重建时跳过仍需 flush 的 syncpoint。 - - 针对 Normal DROP barrier 的迟到 WAITING 重建,新增缺失 dropped table 覆盖逻辑。 - - 仅当 `BlockedTables.InfluenceType == Normal`、`NeedDroppedTables.InfluenceType == Normal`、tableID 属于 `NeedDroppedTables` 且 `spanController` 中已无该 table task 时,才把该 tableID 标记为覆盖。 - - 目的:table dispatcher 已因先前完成的 DROP/TRUNCATE 调度被删除后,迟到重建的 barrier 不再永久等待已删除的 dispatcher;事件进入 selected/pass 阶段后仍会给 DDL span 发送 `Action_Pass`,不会重复执行 `Action_Write`。 - -- `maintainer/barrier_test.go` 
- - 新增 `TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers`。 - - 覆盖 syncpoint barrier 完成删除后被迟到 WAITING 重建、再由 checkpoint-forward 进入 selected 的场景。 - - 断言同 ts 已 DONE 的 dispatcher 会被计入重建后的 range checker,barrier 不会永久等待旧 DONE 状态。 - - 新增 `TestForwardBarrierEventBoundaries`,覆盖 `checkpointTs == commitTs` 不推进、`checkpointTs > commitTs` 推进、同 ts syncpoint/DDL DONE 与 WAITING 的顺序边界。 - - 新增 `TestNormalBarrierRecreatedAfterDroppedTableRemoved`。 - - 覆盖 DROP TABLE barrier 已完成并删除 table dispatcher 后,DDL dispatcher 迟到 WAITING 重建同一 barrier 的场景。 - - 新增 `TestNormalBarrierDoesNotCoverMissingNonDroppedTable`,确认非 drop Normal barrier 不会因为 table task 缺失而被误推进。 - -- `downstreamadapter/eventcollector/dispatcher_stat_test.go` - - 新增 `TestAdvanceEpochForResetClearsCommitTsFilter`。 - - 覆盖旧状态已推进到 `220`、reset 到 `150`、新 epoch replay `180` DDL 的场景。 - -## 验证记录 - -## 新增失败:Normal DDL 迟到 WAITING 重建 - -- 5 连跑的第 1 次 attempt(seed `2026061509`)中,workload 已结束并进入 converge,但 finish mark 长时间未同步到下游。 -- `runner.log` 从 `2026/06/15 15:38:08` 开始等待 finish mark;checkpoint 最终卡在 `467019061784216256`。 -- maintainer 日志反复报告普通 DROP TABLE barrier `467019061784216242` 未 resolved:`reported count: 1, require count: 2, uncovered tables: 267`,blocked tables 为 `[267,0]`。 -- 同一 commitTs 的前序日志显示 `2026/06/15 23:40:47.934 +08:00` 已经 `all dispatchers reported event done, remove event`;随后 `2026/06/15 23:40:48.172 +08:00` table 267 dispatcher 被 remove/unregister。 -- 但 `2026/06/15 23:40:48.324 +08:00` DDL dispatcher 才收到同一个 `DROP TABLE db3.t10` 并上报 WAITING,maintainer 因已删除旧 event 而重新创建 barrier。 -- 根因:`blockedEvents` 不保留已完成事件历史;迟到 WAITING 重建 Normal DDL barrier 后,`checkBlockedDispatchers` 只检查仍存在的相关 replication 是否已 forward,未把已经删除的 dropped table 视为完成,导致永远等待 table 267。 -- 处理:中断该 doomed attempt,补充 Normal dropped table 缺失覆盖逻辑后重新验证。 - -- `go test ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` - - 结果:失败,原因是远端默认 `go` 为 1.25.3,`go.mod` 要求 `>=1.25.10` 且 `GOTOOLCHAIN=local`。 - -- `GOTOOLCHAIN=auto go test 
./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` - - 结果:失败,原因是 TiDB testkit 要求 `--tags=intest`。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -run TestScanAndMergeEventsSkipsDeletedTableTxn -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -run "TestScanAndMergeEventsSkipsDeletedTableTxn|TestEventScannerWithDeleteTable" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run TestAdvanceEpochForResetClearsCommitTsFilter -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run "TestAdvanceEpochForResetClearsCommitTsFilter|TestCheckpointTsForEventServiceUsesCollectorObservedMaxTs|TestFilterAndUpdateEventByCommitTs|TestHandleSingleDataEventsUpdatesDDLStateAndDedupsSameTsDDL|TestHandleSignalEvent|TestGroupHeartbeatResetThenHandshake" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` - - 结果:再次通过。 - -- `git diff --check -- pkg/eventservice/event_scanner.go pkg/eventservice/event_scanner_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/dispatcher_stat_test.go .issue/weekly_rand_single_failure_analysis.md` - - 结果:通过。 - -- reviewer subagent:event scanner 修复审查 - - 结果:未发现阻塞问题。 - - 结论:`finalizeScan(..., rawEvent.CRTs)` 只应用于 table 已在 `rawEvent.CRTs-1` 不存在的情况,不会丢弃同 commit ts 下仍可用旧 schema 解码的 DML。 - -- reviewer subagent:eventcollector reset 修复审查 - - 结果:未发现阻塞问题。 - - 结论:旧 epoch 事件仍由 epoch 过滤拦截;新 epoch 仍要求 handshake/seq;heartbeat 仍受 `maxEventTs` 限制;reset 清理 commitTs flags 只避免旧 epoch 状态误杀新 epoch replay。 - -- `GOTOOLCHAIN=auto make integration_test_build_fast` - - 结果:通过。 - -- 第一次重新运行 `weekly_rand_single` - - 命令:`GOTOOLCHAIN=auto 
RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=2026061509 tests/integration_tests/run.sh mysql weekly_rand_single`。 - - 结果:失败;原 table 145 DDL 缺失问题没有复现,新的卡点为 syncpoint `467018333552640000`。 - - 失败摘要:runner 报 `checkpoint did not advance for 5m9.912642314s (hard=5m0s)`,maintainer 报 `active_ddl=1`。 - - 关键 barrier 日志:第二个同 ts syncpoint barrier 的 coverage 为 `reported count: 184, require count: 185, uncovered tables: 299`。 - - 定位结论:第一次 barrier 完成删除后,迟到 WAITING 重建第二个 barrier;table 299 已在第一次 barrier 上报 DONE,但第二个 barrier 的 range checker 未继承该状态。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` - - 结果:通过。 - -- `git diff --check -- pkg/eventservice/event_scanner.go pkg/eventservice/event_scanner_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/dispatcher_stat_test.go maintainer/barrier_event.go maintainer/barrier_test.go .issue/weekly_rand_single_failure_analysis.md` - - 结果:通过。 - -- reviewer subagent:maintainer barrier 修复审查 - - 结果:未发现阻塞问题。 - - 结论:checkpoint-forward 进入 selected 时会重建/重置 range checker,并把 `checkpointTs > commitTs` 或同一 barrier 已 DONE 的 dispatcher 计入覆盖。 - - 边界确认:没有放宽 `checkpointTs == commitTs`;同 ts DDL DONE 不会误判 syncpoint 已完成;同 ts syncpoint 状态推进 DDL barrier 仍符合 `(commitTs, isSyncPoint)` 顺序。 - - 建议:补充 `forwardBarrierEvent` 边界单测;已补 `TestForwardBarrierEventBoundaries`。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestForwardBarrierEventBoundaries|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` - - 结果:再次通过。 - -- 第一次 5 连跑 attempt 1(seed `2026061509`) - - 命令:`GOTOOLCHAIN=auto RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=2026061509 tests/integration_tests/run.sh mysql weekly_rand_single`(5 seed loop 的第 1 次)。 - - 结果:中断;已确定会卡在 Normal DROP TABLE barrier 
`467019061784216242`,未继续等待到 runner 自身超时。 - - 新根因:迟到 WAITING 重建已完成的 Normal DROP barrier,table 267 dispatcher 已被删除后无法再次上报。 - -- explorer subagent:Normal DROP barrier 迟到重建修复边界 - - 结果:确认主线方向正确,但必须限制为 `NeedDroppedTables` 中且已无 task 的 Normal tableID。 - - 结论:进入 selected/pass 阶段后应发送 `Action_Pass`,不能重新 `Action_Write`;非 drop Normal barrier 不能因为 table 缺失被覆盖。 - - 已按建议收紧实现并补负向单测。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers|TestForwardBarrierEventBoundaries" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` - - 结果:通过。 - -## 新增失败:ReleasePath 清掉 blocked dispatcher 的后续 barrier 事件 - -- 第二次 5 连跑 attempt 1(seed `2026061509`)失败在 syncpoint `467019914280960000`。 -- runner 报错:`checkpoint did not advance for 5m9.924647605s (hard=5m0s)`。 -- maintainer coverage:`reported count: 193, require count: 194, uncovered tables: 164`。 -- table 164 dispatcher ID:`106834843083412973892104354966216139137`。 - -关键时间线: - -- `2026-06-16 00:10:21.030 +08:00`:event_broker 向 table 164 发送 DDL `CREATE INDEX idx_ts_1705 ON db1.t16(ts)`,`commitTs=467019911350976624`,`seq=5`。 -- `2026-06-16 00:10:21.037 +08:00`:event_broker 随后发送目标 syncpoint `467019914280960000`,`seq=6`。 -- `2026-06-16 00:14:46.480 +08:00`:eventcollector memory control 对 table 164 所在 path 执行 `ReleasePath`,`releasedSize=6652`。 -- `2026-06-16 00:15:09.168 +08:00`:table 164 的 DDL seq=5 才收到 maintainer pass 并处理完成,耗时 `38.62552003s`。 -- `2026-06-16 00:15:09.170 +08:00`:dispatcher 下一条处理到的是 `ResolvedEvent seq=30`,而 `lastEventSeq=5`,触发 out-of-order reset。 -- `2026-06-16 00:15:09.173 +08:00`:旧 epoch 的 syncpoint `seq=31` 被识别为 stale epoch 并忽略。 - -根因判断: - -- `EventsHandler` 对 DDL/SyncPoint/DML 使用 dynstream,同一个 dispatcher path 在 DDL/SyncPoint 阻塞期间会停止消费后续事件。 -- eventcollector memory control 在内存压力下会给 blocked path 发送 `ReleasePath`,`processDSFeedback` 
原逻辑只调用 `ds.Release(path)` 清空该 path pending queue。 -- 被清空的 pending queue 中包含 event_broker 已按顺序发送但 dispatcher 尚未消费的 syncpoint/DDL,例如 table 164 的 syncpoint `467019914280960000 seq=6`。 -- 清队列后 eventcollector 没有立即 reset eventservice,导致 eventservice 继续认为 dispatcher 还在同一个 epoch 顺序消费;等 DDL 解阻后,dispatcher 看到最新 resolved event 的 seq 跳跃才 reset。 -- 这个 reset 太晚:目标 syncpoint 已经在旧 epoch 被清掉且没有进入 dispatcher,maintainer 的 All syncpoint barrier 因缺 table 164 report 永久卡住。 - -修复方案: - -- 在 `downstreamadapter/eventcollector/event_collector.go` 中抽出 `handleReleasePathFeedback`。 -- 收到 `ReleasePath` 后仍先调用 dynstream `Release(path)`,确保旧 pending queue 会被清理。 -- 随后查找该 dispatcher 的 `dispatcherStat`,如果还存在,立即调用 `stat.session.resetCurrentEventService()`。 -- 顺序要求是先 enqueue Release,再发送 RESET;这样新 epoch handshake 会排在 release 之后,避免刚清掉的旧队列和新 epoch 事件混杂。 -- 对 default DS 和 redo DS 使用同一 helper,保留原来的 `memoryReleaseCount` 统计,用于 eventservice scan-window 压力调整。 - -补充测试: - -- `downstreamadapter/eventcollector/dispatcher_stat_test.go` - - 新增 `TestReleasePathFeedbackResetsCurrentEventService`。 - - 构造一个正在从 local eventservice 收数据的 dispatcher session。 - - 调用 `handleReleasePathFeedback`。 - - 断言 release callback 被调用、`memoryReleaseCount` 增加、并向当前 eventservice 发出 `ACTION_TYPE_RESET` 请求。 - -补充验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -run "TestReleasePathFeedbackResetsCurrentEventService|TestAdvanceEpochForResetClearsCommitTsFilter" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/eventcollector -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./pkg/eventservice -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` - - 结果:通过。 - -## 新增失败:selected syncpoint 被旧 DDL replay 回退 - -- 第三次 5 连跑 attempt 1(seed `2026061509`)失败在 syncpoint `467020488376320000`。 -- runner 报错:`checkpoint did not advance for 5m9.991098123s (hard=5m0s)`。 -- maintainer 选中了 syncpoint barrier,但 coverage 长期为 `reported count: 174, 
require count: 178, uncovered tables: 228, 0, 341, 562`。 -- 更早的普通 DDL barrier 已完成并发送过 pass: - - `467020488239480915`:`ALTER TABLE db2.t17 ADD PARTITION`,涉及 table `226,227,228,578,0`。 - - `467020488305017082`:`ALTER TABLE db4.t16 ADD PARTITION`,涉及 table `339,340,341,0`。 -- 后续日志显示这些 dispatcher 在收到 syncpoint pass 之前,又因为 ReleasePath/reset 后的 eventservice replay 收到了旧 DDL WAITING。 -- `basic_dispatcher.go` 反复打印 `ignore stale block event action`:例如 table 228/578/341 的 `pendingEventCommitTs` 是旧 DDL commit ts,而 maintainer 下发的 action commit ts 是更晚的 syncpoint `467020488376320000`。 - -根因判断: - -- ReleasePath/reset 修复后,eventservice 会重新发送被释放 path 中的旧 block event,这是正确行为。 -- 但是 dispatcher 原来只保存一个当前 `blockPendingEvent`,不记已经完成过的 block event 高水位;因此旧 DDL replay 可以把本地 pending 状态从更晚的 syncpoint 回退到更早的 DDL。 -- `actionMatchs` 原来只比较 commit ts,没有比较 `IsSyncPoint`;同 ts DDL/syncpoint 场景下也存在误匹配风险。 -- maintainer 侧 `span.UpdateBlockState` 原来会直接覆盖状态;迟到的旧 WAITING 可能把该 dispatcher 在 barrier 计算中的 block state 回退。 -- selected barrier 进入 pass/write 阶段后,只在 selected 前做过一次 forwarded dispatcher 统计;如果 selected 后 dispatcher 再上报更晚的 WAITING,没有重新用 `forwardBarrierEvent` 刷新 range checker,syncpoint barrier 会继续等已经前进过的 dispatcher。 - -修复方案: - -- 在 dispatcher 的 `BlockEventStatus` 中增加已完成 block event 水位,按 `(commitTs, isSyncPoint)` 排序,其中同 commit ts 下 DDL 在 syncpoint 之前。 -- `reportBlockedEventDone` 记录完成水位;`DealWithBlockEvent` 在 flush DML 后发现 replay 的 block event 不大于完成水位时,直接 pass 到 sink、记录完成并上报 DONE,不再向 maintainer 重新报告 WAITING。 -- `actionMatchs` 增加 `IsSyncPoint` 比较,避免同 commit ts 的 DDL/syncpoint action 互相匹配。 -- maintainer 更新 dispatcher block state 时改为 `updateSpanBlockState`,只接受按 `(BlockTs, IsSyncPoint, Stage)` 不回退的新状态。 -- selected barrier 在 `resend` 中调用 `refreshSelectedProgress`,重新把已经 forward 到更晚 block event 的 dispatcher 加入 range checker;writer dispatcher 也用同一规则重新判断。 - -补充测试: - -- `downstreamadapter/dispatcher/helper_test.go` - - `TestBlockEventStatusCompletedWatermark` 覆盖完成 syncpoint 后旧 DDL replay 被识别为 obsolete,同时确认完成 DDL 不会覆盖同 
ts syncpoint。 - - `TestBlockEventStatusActionMatchesSyncPointFlag` 覆盖 action 必须同时匹配 commit ts 和 `IsSyncPoint`。 -- `maintainer/barrier_test.go` - - `TestSelectedBarrierRefreshesAdvancedReplications` 覆盖 selected 后 dispatcher 上报更晚 normal DDL WAITING,syncpoint barrier resend 时可刷新覆盖并推进。 - - `TestUpdateSpanBlockStateSkipsStaleState` 覆盖 maintainer 不接受旧 block state 回退。 - - `TestForwardBarrierEventBoundaries` 新增更晚 normal WAITING 可以 forward syncpoint 的边界。 - -补充验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBlockEventStatusCompletedWatermark|TestBlockEventStatusActionMatchesSyncPointFlag" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestSelectedBarrierRefreshesAdvancedReplications|TestUpdateSpanBlockStateSkipsStaleState|TestForwardBarrierEventBoundaries|TestSyncPointBarrierRecreatedCountsAlreadyDoneDispatchers|TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -count=1` - - 结果:通过。 - -- 直接 `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBatchDMLEventsPartialFlush|TestRedoBatchDMLEventsPartialFlush" -count=1` 会失败;原因是该测试依赖 failpoint transform,直接 `go test` 时 `failpoint.Inject` 是空 marker,不能作为业务回归失败判断。 - -- `GOTOOLCHAIN=auto make failpoint-enable && GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestBatchDMLEventsPartialFlush|TestRedoBatchDMLEventsPartialFlush" -count=1 -v && GOTOOLCHAIN=auto make failpoint-disable` - - 结果:通过。 - -- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` - - 结果:通过,119 个测试通过;`maintainer` coverage `67.4%`,`downstreamadapter/dispatcher` coverage `61.1%`。 - -## 新增失败:已完成 DDL 后 replay 旧 DML 导致 downstream 表不存在 - -- 第四次 5 连跑 attempt 1(seed `2026061509`)失败为 `changefeed state is not normal: warning`。 -- 直接错误来自 MySQL sink:`Error 1146 (42S02): Table 
'db1.t15' doesn't exist`。 -- CDC 日志证据: - - `cdc.log:224376`:`dispatcher_manager.go` 报 `Event Dispatcher Manager Meets Error`,失败 SQL 包含 `REPLACE INTO db1.t15`。 - - `cdc.log:239541`:maintainer 收到 dispatcher error,错误 DML 的 `startTs/commitTs` 包含 `{467021254219269128 467021254219269151}`。 - - `cdc.log:240517`:changefeed maintainer report error,state 进入 `warning`。 - - `cdc.log:240742`:coordinator 将 changefeed 状态更新为 `warning`。 - -关键时间线: - -- `2026-06-16 01:38:20.074 +08:00`:event_broker reset table 571 dispatcher `1464598323537297354314327360035871696782`,`newStartTs=467021254219269150`,`newEpoch=2`。 -- `2026-06-16 01:38:34.118 +08:00`:table trigger dispatcher 收到 `RENAME TABLE db1.t15 TO db1.t15_r_3235459`,`commitTs=467021254232638300`。 -- `2026-06-16 01:38:34.426 +08:00`:MySQL sink 成功执行 rename DDL。 -- `2026-06-16 01:38:34.971 +08:00`:maintainer 看到 table trigger dispatcher 和 table dispatcher 均 DONE,移除该 rename barrier。 -- `2026-06-16 01:38:37.306 +08:00`:event_broker 在 reset 后又向 table 571 dispatcher 发送旧 DML,`commitTs=467021254219269151`,小于 rename DDL commit ts。 -- `2026-06-16 01:38:37.373 +08:00`:table dispatcher 收到该旧 DML,表名仍是 `db1.t15`。 -- `2026-06-16 01:38:37.657 +08:00`:sink 执行该旧 DML,此时 downstream 已 rename/drop `db1.t15`,于是报 1146。 - -根因判断: - -- 前一轮修复让 ReleasePath/reset 后可以正确 replay 被释放队列中的 block event,解决了 barrier 缺上报的问题。 -- 但 replay 也会把已完成 DDL 之前的旧 DML 重新送到 dispatcher。 -- dispatcher 在 `reportBlockedEventDone` 之后已经能知道某个 DDL/syncpoint barrier 完成;完成这个 barrier 意味着 `FlushDMLBeforeBlock` 已经保证该 barrier 之前的 DML 要么已进入 sink,要么已完成 flush。 -- 因此同一 dispatcher 后续 replay 进来的 `commitTs <= completedBlockCommitTs` 的 DML 是过期事件,继续写 sink 会在 rename/drop/truncate 后访问旧表名,导致下游 `table doesn't exist`。 - -修复方案: - -- 在 `BlockEventStatus` 中增加 `isDMLCompletedOrObsolete(commitTs)`。 -- `AddDMLEventsToSink` 在 active-active/soft-delete 过滤前先检查 DML commit ts:如果 `commitTs <= completedBlockCommitTs`,直接跳过该 DML,不加入 `tableProgress`,也不调用 `sink.AddDMLEvent`。 -- 该过滤只在 dispatcher 已记录完成过 block event 
后生效,不影响正常首次消费;完成水位来自 `reportBlockedEventDone`,即 DDL/syncpoint 已经写入或 pass 并向 maintainer 报 DONE。 - -补充测试: - -- `downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` - - 新增 `TestHandleEventsSkipsDMLBeforeCompletedBlockEvent`。 - - 构造一个已完成 block event commitTs 为 `120` 的 dispatcher。 - - 喂入 commitTs `120` 的旧 DML 和 commitTs `140` 的新 DML。 - - 断言 sink 只收到 commitTs `140` 的新 DML。 - -补充验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestHandleEventsSkipsDMLBeforeCompletedBlockEvent|TestBlockEventStatusCompletedWatermark|TestBlockEventStatusActionMatchesSyncPointFlag" -count=1` - - 结果:通过。 - -- `git diff --check -- downstreamadapter/dispatcher/helper.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` - - 结果:通过。 - -- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` - - 结果:通过,120 个测试通过;`maintainer` coverage `67.4%`,`downstreamadapter/dispatcher` coverage `61.4%`。 - -## 待完成 - -- 重新构建 `cdc` 并运行 `weekly_rand_single` case 连续 5 次通过;优先用原失败 seed `2026061509` 覆盖,再继续跑后续 seed 确认连续通过。 - - -## 最新失败:finish mark 收敛超时,Normal blocker 中的 DDL span tableID=0 未被覆盖 - -- 新一轮 5 连跑的第 1 次 attempt(seed `2026061509`)失败在 converge timeout:`runner failed: context deadline exceeded`。 -- workload 已经结束,上游 `db1.finish_mark` 已创建并写入 `2026061509`,下游没有出现 `db1.finish_mark`。 -- `cdc-2026-06-16T09-25-02.931.log:104288` 显示 event broker 已向 table trigger dispatcher 发送 finish mark DDL,`commitTs=467028537897123860`。 -- runner timeout 时 changefeed checkpoint 只推进到 `467028289781760000`,明显落后于 finish mark DDL。 -- CDC 日志反复出现 `barrier event is not resolved`,并显示 `uncovered tables: 0`;同时大量 `register dispatcher with large startTs lag` 表明 schedule-required DDL 已经持续堆积。 -- 一个典型卡点是 normal DDL `ALTER TABLE db1.t18 ADD PARTITION ...`,`commitTs=467028226954756564`,blocker tableIDs 包含 `2640 172 173 174 925 1265 0`。 - -根因判断: - -- Normal DDL blocker 会把 `common.DDLSpanTableID`(值为 `0`)放进 
`BlockedTables.TableIDs`,代表 table trigger / DDL span 也需要参与 barrier。 -- maintainer 的 Normal 分支原来统一通过 `spanController.GetTasksByTableID(tableID)` 找 replication。 -- 对 `tableID=0`,`GetTasksByTableID(0)` 不会返回 DDL dispatcher;DDL dispatcher 需要通过 `GetTaskByID(GetDDLDispatcherID())` 获取。 -- 因此迟到 WAITING 重建或 checkpoint-forward 场景中,即使 DDL dispatcher 已经前进,`checkBlockedDispatchers`、`relatedReplications` 和 `sendPassAction` 都无法把 tableID 0 计入覆盖或 PASS 目标,最终 barrier 反复显示 `uncovered tables: 0`,checkpoint 无法追到 finish mark。 - -修复方案: - -- 在 `maintainer/barrier_event.go` 新增 `getTasksByBlockedTableID(tableID)`。 -- 普通 tableID 仍走 `spanController.GetTasksByTableID(tableID)`。 -- `common.DDLSpanTableID` 改为走 `spanController.GetTaskByID(spanController.GetDDLDispatcherID())`。 -- 将 Normal blocker 的三处路径切到该 helper: - - `relatedReplications`:checkpoint-forward / DONE 阶段 coverage 能看到 DDL span。 - - `sendPassAction`:Normal PASS 能把 DDL dispatcher 纳入 influenced dispatchers。 - - `checkBlockedDispatchers`:迟到 WAITING 时能通过已前进的 DDL dispatcher 推进 barrier。 - -补充测试: - -- `maintainer/barrier_test.go` 新增 `TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID`。 -- 场景:普通 table dispatcher 上报 Normal WAITING,blocker tableIDs 为 `[1, common.DDLSpanTableID]`;DDL span checkpoint 已大于 barrier commitTs,但 DDL dispatcher 没有再上报 WAITING。 -- 断言:`checkBlockedDispatchers` 能通过 tableID 0 找到 DDL dispatcher,进入 selected/pass 阶段;`resend` 发出的 PASS 同时包含普通 table dispatcher 和 DDL dispatcher。 - -补充验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID|TestNormalBarrierRecreatedAfterDroppedTableRemoved|TestNormalBarrierDoesNotCoverMissingNonDroppedTable|TestSelectedBarrierRefreshesAdvancedReplications|TestForwardBarrierEventBoundaries" -count=1` - - 结果:通过。 - -- `git diff --check -- maintainer/barrier_event.go maintainer/barrier_test.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/helper.go 
downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go downstreamadapter/dispatcher/helper_test.go downstreamadapter/eventcollector/dispatcher_stat.go downstreamadapter/eventcollector/event_collector.go pkg/eventservice/event_scanner.go` - - 结果:通过。 - -- `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"` - - 结果:通过,121 个测试通过;`maintainer` coverage `67.6%`,`downstreamadapter/dispatcher` coverage `61.4%`。 - - -## subagent 审查新增风险:selected schedule barrier 与 held obsolete block event - -subagent 审查后确认两个额外风险,需要在最终 5 连跑前修掉: - -1. `refreshSelectedProgress` 可以通过 checkpoint/blockState forwarding 把 selected barrier 的 writer 标成 advanced。 - - 对普通 DDL/syncpoint 这是正确的,可以避免迟到 WAITING 重建后卡住。 - - 但对 `needSchedule` DDL,如果直接标记 writer advanced,`Barrier.handleEventDone` 中的 `tryScheduleEvent` 不会执行,后续可能先发 PASS,导致 add/drop table scheduling 没应用。 - - 风险表现:新表未加入 spanController、旧表未删除、`pendingEvents` 未清空,后续 DB/All barrier 的 range checker 使用错误任务快照。 - -2. table-trigger dispatcher 的 DB/All block event 可能因为 `pendingACKCount > 0` 被 hold。 - - 直接 `DealWithBlockEvent` 的非 hold 路径已有 obsolete block event 跳过逻辑。 - - 但 hold 分支和 `flushBlockedEventAndReportToMaintainer` 释放路径缺少同样检查。 - - 风险表现:已经完成的 replay DB/All DDL/syncpoint 被重新 report WAITING,可能造成重复 WRITE/PASS 或重建 barrier。 - -修复: - -- `maintainer/barrier.go` - - `Barrier.Resend` 改为调用 `barrierEvent.resendWithSchedule(b.mode, b.tryScheduleEvent)`。 - - 真实 barrier resend 路径拥有 pending schedule queue,因此可以在 writer 被 forwarding 判定越过时先执行 `tryScheduleEvent`。 - -- `maintainer/barrier_event.go` - - `refreshSelectedProgress` 改为返回 writer 是否已 forward。 - - 如果 event `needSchedule`,该函数只返回 true,不直接设置 `writerDispatcherAdvanced`。 - - `resendWithSchedule` 在 `needSchedule` 且 writer forward 时调用 `tryScheduleEvent`;只有 schedule 成功后才进入 PASS 发送路径。 - - 直接 `event.resend` 保留无调度回调版本,供单元测试和非 barrier 调用使用。 - -- `downstreamadapter/dispatcher/basic_dispatcher.go` - - 新增 `completeObsoleteBlockEvent`,统一执行:检查 completed watermark、local pass、report 
DONE、wake dispatcher status stream。 - - `DealWithBlockEvent` 的 held path、普通 blocking path、`flushBlockedEventAndReportToMaintainer` 释放 path 都复用该函数。 - - replay 的 obsolete DB/All block event 不再重新进入 WAITING。 - -新增测试: - -- `maintainer/barrier_test.go` - - `TestResendSchedulesForwardedNeedScheduleBarrierBeforePass`:构造 selected + needSchedule barrier,DDL dispatcher checkpoint 已越过 barrier;断言 `Barrier.Resend` 会先 pop `pendingEvents` 并 schedule 新表,再发 PASS。 - -- `downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` - - `TestHeldObsoleteBlockEventCompletesWithoutWaitingReport`:构造 table-trigger dispatcher hold 一个 syncpoint;随后 completed watermark 覆盖该 syncpoint,再释放 held event;断言输出 DONE,且没有新增 resend task / WAITING。 - -验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer -run "TestResendSchedulesForwardedNeedScheduleBarrierBeforePass|TestNormalBarrierUsesDDLDispatcherForDDLSpanTableID|TestSelectedBarrierRefreshesAdvancedReplications" -count=1` - - 结果:通过。 - -- `GOTOOLCHAIN=auto go test --tags=intest ./downstreamadapter/dispatcher -run "TestHeldObsoleteBlockEventCompletesWithoutWaitingReport|TestHandleEventsSkipsDMLBeforeCompletedBlockEvent|TestBlockEventStatusCompletedWatermark" -count=1` - - 结果:通过。 - -- `git diff --check -- maintainer/barrier.go maintainer/barrier_event.go maintainer/barrier_test.go downstreamadapter/dispatcher/basic_dispatcher.go downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go` - - 结果:通过。 - - -## 最新失败:修复 correctness 后,weekly 收敛窗口不足 - -5 连跑第 1 次 attempt(seed `2026061509`)在修复后继续运行到 workload 结束,但失败点变成收敛超时: - -```text -2026/06/16 02:55:45.661383 workload finished, waiting for converge: 20s -2026/06/16 03:26:05.681526 runner failed: context deadline exceeded -===== weekly_rand_single failed seed=2026061509 status=1 ===== -``` - -关键事实: - -- 上游 finish mark 已写入:`db1.finish_mark` 中存在 `id=1, v=2026061509`。 -- 下游到 timeout 时仍没有 `db1.finish_mark`。 -- CDC schema store 已读到并发送 finish mark DDL:`CREATE TABLE IF NOT EXISTS 
db1.finish_mark`,`finishedTs=467030131521880072`。 -- 该 ts 的物理时间是 `2026-06-16 10:56:05.681`。 -- timeout 前最新 checkpoint 为 `467029854781439999`,物理时间 `2026-06-16 10:38:29.999`。 -- 因此 timeout 时距离 finish mark 还差约 `17m36s` 的 TiDB 逻辑时间。 - -推进速率: - -- `03:08:56` 时 checkpoint 约为 `2026-06-16 10:32:50.282`。 -- `03:23:26` 时 checkpoint 约为 `2026-06-16 10:37:51.582`。 -- 约 `14.5m` 真实时间推进了约 `5m01s` 逻辑时间。 -- subagent 复核的整段 converge 速率约为 `0.30-0.31x` realtime;剩余 `17m36s` 逻辑时间预计还需要约 `56-58m`。 - -根因判断: - -- 前面已修复的 event scanner、dispatcher completed watermark、maintainer barrier coverage/schedule 问题解决的是 correctness 卡死风险。 -- 本次最新失败没有出现 checkpoint 永久不动、changefeed failed、checksum diff 或 panic/fatal/race。 -- 失败由 weekly profile 的 workload/backlog 与固定 `converge_timeout=30m` 不匹配触发:30 分钟 workload 可制造超过 30 分钟才能追完的积压。 - -修复计划: - -1. 保留 smoke profile 的短收敛窗口,避免普通本地短跑变慢。 -2. 为 weekly random DDL case 增加 `RUN_CONVERGE_TIMEOUT` 环境变量。 -3. 当 `RUN_PROFILE=weekly` 且未显式指定 `RUN_CONVERGE_TIMEOUT` 时,将默认 converge timeout 提高到 `120m`。 -4. 在 `run_weekly_rand_ddl_it_in_ci.sh` 中显式导出并打印 `RUN_CONVERGE_TIMEOUT`,让 CI 日志能直接看到该参数。 -5. 重新运行原失败 seed,并继续跑到 5 次连续通过。 - -修改文件: - -- `tests/integration_tests/weekly_rand_single/run.sh` -- `tests/integration_tests/weekly_rand_multi/run.sh` -- `tests/integration_tests/weekly_rand_multi_failover/run.sh` -- `tests/integration_tests/weekly_rand_slow_lossy_ddl/run.sh` -- `tests/integration_tests/run_weekly_rand_ddl_it_in_ci.sh` - -验证计划: - -- `bash -n` 检查所有改动的 shell 脚本。 -- 生成 `runner_config.json` 后确认 weekly profile 的 `verify.converge_timeout` 为 `120m`,smoke profile 默认仍为 `30m`。 - -## 最新失败:`RECOVER TABLE` 下游 schema 非确定 - -第二次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 后不再因为 30 分钟收敛窗口退出,但 changefeed 进入 warning: - -关键日志: - -```text -runner failed: changefeed state is not normal: warning -Error 1054 (42S22): Unknown column 'a' in 'field list' -REPLACE INTO `db1`.`t15_r_3235459` (`id`,`b`,`c`,`d`,`e`,`bin`,`a`) VALUES (...) 
-``` - -时间线: - -- 上游 `03:42:43` 对 `db1.t15_r_3235459` 执行 `ALTER TABLE ... DROP COLUMN a`,随后 `03:42:44` 执行 `DROP TABLE`。 -- 上游 `03:43:48` 执行 `RECOVER TABLE db1.t15_r_3235459`,TiCDC DDL event 的 `TableInfo` 是 recovered table,后续 DML schema 包含列 `a`。 -- MySQL sink 在下游直接执行原始 `RECOVER TABLE db1.t15_r_3235459` 并成功。 -- recover 后新 table dispatcher handshake 的 tableID 为 `1758`,resolved ts 为 recover commitTs `467030881924284444`。 -- 第一条后续 DML commitTs 为 `467030882068463689`,SQL builder 根据上游 recovered `TableInfo` 生成带 `a` 的 REPLACE;下游实际表缺列 `a`,因此 DML 达到最大重试并使 changefeed warning。 - -根因判断: - -- `RECOVER TABLE` 依赖执行集群本地 DDL history / recycle-bin / GC snapshot 状态;裸 `RECOVER TABLE db.t` 由下游 TiDB 自己选择历史表。 -- TiCDC 内部 schema store 能按上游 DDL job 得到 recovered `TableInfo`,但 sink 执行原始 SQL 后,下游可能恢复出不同历史 schema。 -- `RECOVER TABLE BY JOB ` 不能直接用上游 drop job id 修复;下游执行该语法时查询的是下游本地 DDL job id,当前 TiCDC 没有维护上游 drop/truncate job id 到下游 job id 的映射。 -- 将 recover 改写为 `CREATE TABLE` 也不是正确修复,因为 `RECOVER TABLE` 的产品语义包含恢复旧数据,单纯建空表会丢数据。 -- 因此这是 CDC 对 `RECOVER TABLE` 复制语义支持不完整的问题,不适合作为 weekly random DDL 的默认压力操作。 - -修复决策: - -1. 不在 random DDL 默认集合中生成 `recover_table`,避免 weekly case 稳定触发一个当前不具备确定复制语义的 DDL。 -2. 当前修复范围只调整 random runner;正式支持 `RECOVER TABLE` 需要单独设计 deterministic recover,比如维护下游 drop/truncate job id 映射并处理路由、重试、GC,或在 recover 后做数据重建/快照补偿。 -3. 
保留 `genRecoverTable` 函数,供将来显式测试或产品级修复验证使用。 - -已实施修复: - -- `tests/utils/random_ddl_test_runner/ddl.go` - - 从 `defaultDDLKinds()` 中移除 `recover_table`。 - - 增加注释说明裸 `RECOVER TABLE` 为什么不能作为 CDC random DDL 默认操作。 -- `tests/utils/random_ddl_test_runner/ddl_test.go` - - 新增 `TestDefaultDDLKindsExcludeRecoverTable`,防止默认集合再次加入 `recover_table`。 - - 同时确认 `genRecoverTable` 仍可用于显式测试。 - -新增验证: - -- `GOTOOLCHAIN=auto go test ./tests/utils/random_ddl_test_runner -run "TestDefaultDDLKindsExcludeRecoverTable|TestGen" -count=1` - - 结果:通过。 - -后续验证: - -- 重新运行 shell 语法检查、random runner 包测试和 fast integration build。 -- 重新运行 `weekly_rand_single` 原 seed 和后续 seeds,直到连续通过 5 次。 -- 如果后续再出现 failure,应按新的 `runner failed:` 类型继续分类,不能再把 `RECOVER TABLE` schema mismatch 当成 timeout 问题。 - -## 最新失败:dispatcher recreate 使用旧 startTs 重放 DDL 前 DML - -第三次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 且移除 `RECOVER TABLE` 默认生成后仍失败,但失败类型已经不是 30 分钟收敛窗口不足,也不是 `RECOVER TABLE` schema 非确定性。changefeed 进入 warning,sink DML 达到最大重试: - -```text -runner failed: changefeed state is not normal: warning -[CDC:ErrReachMaxTry] ... REPLACE INTO `db2`.`t14_r_3402273` (`id`,`a`,`b`,`c`,`d`,`e`,`bin`) ... 
-Error 1054 (42S22): Unknown column 'bin' in 'field list' -Error 1054 (42S22): Unknown column 'e' in 'field list' -``` - -关键时间线(`/tmp/tidb_cdc_test/weekly_rand_single/cdc.log`): - -- `13:10:41.127`:旧 dispatcher `42013703021131921156525107956428657798` 收到 local event service ready,reset 到 `resetTs=467031808440533390`。 -- `13:10:42.033`:旧 dispatcher 从 resetTs 后开始 replay table `1947` 的 DML,第一条 DML commitTs 为 `467031808466747878`。 -- `13:10:42.154`:旧 dispatcher 收到并处理 `ALTER TABLE db2.t14_r_3402273 DROP COLUMN bin`,DDL commitTs 为 `467031808781320457`。 -- `13:10:42.847`:旧 dispatcher stopped,返回最终 checkpoint `467031809410466489`,说明它已经把上述 DML 和后续 DDL 之前的事件 flush 完。 -- `13:10:42.851`:更晚的 barrier `467031809423835501` 已完成并从 `blockedEvents` 删除。 -- `13:10:42.973`:一个更旧的 add-table barrier `467031808440533390` 迟到执行 `AddNewTable(tableID=1947)`。 -- `13:10:42.974`:新 dispatcher `155867647287056528072758204918054850230` 被创建,checkpoint/startTs 仍是旧的 `467031808440533390`。 -- `13:10:44.963`:新 dispatcher 再次 replay commitTs `467031808466747878` 的 DML。此时下游 schema 已经由旧 dispatcher 执行过 `DROP COLUMN bin`,所以同一条 DML 打到 post-DDL schema,报 `Unknown column 'bin'`。 - -根因判断: - -- 这是 stale barrier 和 dispatcher remove/add 状态水位之间的竞态。 -- `BarrierEvent.scheduleBlockEvent` 对 add-table 直接调用 `spanController.AddNewTable(..., be.commitTs)`。 -- 当更旧的 add-table barrier 迟到时,`be.commitTs` 可能低于同 tableID 上一个 dispatcher 已经关闭并 flush 到的 checkpoint。 -- 旧 dispatcher 的 stopped status 带有安全水位 `467031809410466489`,但 remove operator 只更新已经脱离 spanController 管理的 `replicaSet`,没有把这个 table 级水位提供给后续 `AddNewTable` 使用。 -- 之前加在 `SpanReplication.NewAddDispatcherMessage` 里的 controller-level committed checkpoint 保护不能覆盖该窗口,因为全局 checkpoint 当时仍被其它 backlog 卡在更旧位置。 - -修复策略: - -1. 在 `maintainer/span.Controller` 中维护 `removedTableCheckpointTs map[int64]uint64`,记录每个 tableID 已移除 dispatcher 报告过的最高 checkpoint。 -2. `AddNewSpans` 创建新 dispatcher 前,用 `removedTableCheckpointTs[tableID]` 对 `startTs` 做下限保护。 -3. 
`removeSpanWithoutLock` 记录移除时已有的 status checkpoint,覆盖同步删除路径。 -4. `removeDispatcherOperator.Check` 收到 `Stopped/Removed` terminal status 时,调用 `RecordRemovedSpanCheckpoint` 把最终 checkpoint 写回 span controller。 -5. `MoveDispatcherOperator.Check` 在 origin stopped 时同步更新 `replicaSet` status,避免 move add-dest 阶段也从旧 checkpoint 创建 dispatcher。 - -已实施修复: - -- `maintainer/span/span_controller.go` - - 新增 table 级 removed checkpoint 记录。 - - `AddNewSpans` 使用 table 级 removed checkpoint clamp 新 dispatcher startTs。 - - 新增 `RecordRemovedSpanCheckpoint`。 -- `maintainer/operator/operator_remove.go` - - terminal status 到达后记录 table 级 removed checkpoint。 -- `maintainer/operator/operator_move.go` - - origin stopped 时更新 `replicaSet` status,保证 add-dest 消息使用 stopped checkpoint。 -- `maintainer/span/span_controller_test.go` - - 新增 `TestControllerAddNewTableClampsToRemovedTableCheckpoint`。 - - 新增 `TestControllerAddNewTableIgnoresLowerRemovedTableCheckpoint`。 -- `maintainer/operator/operator_move_test.go` - - 新增 `TestMoveOperatorUsesStoppedCheckpointWhenAddingDest`。 - -新增验证: - -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer/span ./maintainer/operator -run "TestControllerAddNewTable|TestMoveOperatorUsesStoppedCheckpointWhenAddingDest|TestRemoveOperator|TestMoveOperator_OriginNodeRemovedAfterOriginStopped" -count=1` - - 结果:通过。 -- `GOTOOLCHAIN=auto go test --tags=intest ./maintainer/span ./maintainer/operator ./maintainer -count=1` - - 结果:通过。 - -后续验证计划: - -- 运行 `GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer"`。 -- 运行 `GOTOOLCHAIN=auto make integration_test_build_fast`。 -- 使用 `RUN_PROFILE=weekly RUN_DURATION=30m RUN_CONVERGE_TIMEOUT=120m` 重新跑 `weekly_rand_single`,并继续直到连续 5 次通过。 - -## 最新失败:log scan 对随机 DML payload 中 `panic`/`fatal` 子串误报 - -第四次 5 连跑 attempt 1(seed `2026061509`)在 `RUN_CONVERGE_TIMEOUT=120m` 下已经成功追上 finish mark: - -```text -2026/06/16 06:49:22.074533 converge done: finish mark applied downstream -``` - -这次失败发生在收敛之后的最终日志扫描: - -```text -2026/06/16 
06:52:04.566668 log scan: found 88 matches -2026/06/16 06:52:04.570322 runner failed: log scan found 88 panic/fatal/race matches -``` - -抽样命中内容: - -```text -Rows: Insert: Row: 4240, ..., Bb8bdTFTEIN9i3spwifGjZj3AmFAtalR, ... -Rows: Insert: Row: 60718, ..., 1YCs3x0WFrKYaheC3jpXpAnicxBqG3pe, ... -Rows: Insert: Row: 142906, ..., vXsdTVjMIJZa21NY95aFpiiPANicu51F, ... -``` - -根因判断: - -- `tests/utils/random_ddl_test_runner/logscan.go` 对配置里的 `panic_patterns` 做大小写无关 substring 匹配。 -- weekly random DML 会生成随机字符串列;这些 payload 可能自然包含 `panic` / `fatal` 的大小写变体。 -- 命中行都是 `[DEBUG]` DML event / SQL builder 日志里的 row value,不是真实 `[FATAL]`、`[PANIC]`、Go `panic:`、`fatal error:` 或 race detector 输出。 -- 因此这是 log scan 误报,不是 TiCDC 运行时 panic/fatal,也不是同步正确性错误。 - -修复策略: - -1. 对默认关键字 `panic`/`fatal` 做语义化匹配:只匹配真实日志等级或 Go runtime 前缀。 - - `fatal`: `[FATAL]`、`level=fatal`、行首 `fatal error:`。 - - `panic`: `[PANIC]`、`level=panic`、行首 `panic:`。 -2. `DATA RACE` 继续保留 substring 匹配,因为 race detector 的输出就是固定短语。 -3. 自定义 pattern 继续保持原 substring 行为,避免改变扩展配置语义。 - -已实施修复: - -- `tests/utils/random_ddl_test_runner/logscan.go` - - 新增 `logLineMatchesPattern`,特殊处理默认 `panic`/`fatal`。 - - 调整跨 buffer carry 长度,确保 `fatal error:` / `level=panic` 等特殊模式跨片段时仍可检测。 -- `tests/utils/random_ddl_test_runner/logscan_test.go` diff --git a/.issue/weekly_rand_single_notebook.md b/.issue/weekly_rand_single_notebook.md deleted file mode 100644 index 1d53a090c4..0000000000 --- a/.issue/weekly_rand_single_notebook.md +++ /dev/null @@ -1,345 +0,0 @@ -# weekly_rand_single 调查 Notebook - -本 notebook 记录本 case 调查过程中固定会出现、但通常不是代码根因的错误/噪音,以及下一次遇到时的处理方式。 - -## 固定环境噪音 - -### TiDB 启动检查早期 `ERROR 2003` - -现象: - -```text -Verifying Upstream TiDB is started... 
-ERROR 2003 (HY000): Can't connect to MySQL server on '127.0.0.1:4000' (111) -``` - -判断: - -- 这是启动检查刚开始时 TiDB 端口还没 ready 的 transient error。 -- 如果后面能打印 `mysql.tidb` 变量表,或者继续进入 CDC/changefeed/workload,就不要当成 case 失败。 - -处理: - -- 继续观察,不要因为这一行中断。 -- 只有连续重试后脚本明确 `start tidb cluster failed` 并退出,才作为环境失败处理。 - -### `tiflash: command not found` - -现象: - -```text -Starting Upstream TiFlash... -.../start_tidb_cluster_impl: line 365: tiflash: command not found -start tidb cluster failed -The 2 times to try to start tidb cluster... -``` - -判断: - -- 这是远端 PATH 没包含 TiFlash binary,不是 TiCDC 代码逻辑失败。 -- 远端已有 TiFlash binary: - - `/home/hongyunyan/.tiup/components/tiflash/v9.0.0-beta.2.pre-nightly/tiflash/tiflash` -- 当前这类错误可能出现在 cluster start retry 阶段;如果脚本后续进入 `bootstrap done`、创建 changefeed 并开始 workload,则无需处理。 - -处理: - -- 如果脚本最终继续进入 workload:记录为环境噪音,继续跑。 -- 如果脚本因为找不到 TiFlash 最终退出:补 PATH 后重跑: - -```bash -export PATH=/home/hongyunyan/.tiup/components/tiflash/v9.0.0-beta.2.pre-nightly/tiflash:$PATH -``` - -### `go test` 默认 toolchain 不满足要求 - -现象: - -```text -go.mod requires go >= 1.25.10 -``` - -判断: - -- 远端默认 go 版本可能低于 `go.mod` 要求,或者环境中 `GOTOOLCHAIN=local`。 - -处理: - -- Go 测试统一带: - -```bash -GOTOOLCHAIN=auto go test --tags=intest ./path -run TestName -count=1 -``` - -### 直接 `go test` 跑到 failpoint 代码 - -现象: - -```text -undefined: failpoint.Inject -undefined: failpoint.Return -``` - -判断: - -- 这是 failpoint 代码没有被 rewrite 的编译错误,不是目标 case 的业务失败。 -- 本仓库的 failpoint 相关测试需要走 make 目标,或者先启用 failpoint rewrite。 - -处理: - -- 优先用包级 make 目标: - -```bash -GOTOOLCHAIN=auto make unit_test_pkg PKG="./downstreamadapter/dispatcher ./maintainer" -``` - -- 如果必须直接 `go test`,先按仓库脚本启用 failpoint,结束后再 disable,避免污染后续测试。 - -### 长时间 5 连跑输出过大 - -现象: - -- `weekly_rand_single` workload 每秒输出大量 DDL 行。 -- 5 连跑如果直接 `2>&1 | tee -a log`,终端输出会非常大,远端 ssh 会话可能被输出拖慢。 - -判断: - -- case 结果以 `/tmp/tidb_cdc_test/weekly_rand_single_5pass.log` 里的 pass/fail 标记和脚本退出码为准。 -- 终端不需要实时接收完整 DDL 流,只需要日志文件完整保留。 - -处理: - -- 下次启动 5 连跑时,直接把 stdout/stderr 
写日志,不要 tee 到终端: - -```bash -GOTOOLCHAIN=auto RUN_PROFILE=weekly RUN_DURATION=30m RUN_SEED=${seed} \ - tests/integration_tests/run.sh mysql weekly_rand_single \ - >> /tmp/tidb_cdc_test/weekly_rand_single_5pass.log 2>&1 -``` - -- 如果已经启动成 `tee -a`,可以只重定向父 shell 或当前 `tee` 的 fd 1 到 `/dev/null`,不影响日志文件继续写入: - -```bash -gdb -q -p -batch \ - -ex 'p (int) close(1)' \ - -ex 'p (int) open("/dev/null", 1)' -``` - -- 重定向之后继续用以下命令观察: - -```bash -grep -n "weekly_rand_single passed\|weekly_rand_single failed\|five consecutive" \ - /tmp/tidb_cdc_test/weekly_rand_single_5pass.log -grep -a "health:" /tmp/tidb_cdc_test/weekly_rand_single_5pass.log | tail -n 8 -``` - -### workload 结束瞬间的 `context deadline exceeded` - -现象: - -```text -ddl worker=3 kind=add_index ... err=context deadline exceeded -ddl worker=1 kind=split_add_index ... err=context deadline exceeded -``` - -判断: - -- 如果这些行出现在 `workload finished, waiting for converge` 前后,通常只是 workload 总时长到期,DDL worker 被 context 取消。 -- 这类行本身不是 case 失败;真正失败要看后续是否出现 `runner failed:`、checksum diff、panic/fatal/race,或脚本退出码非 0。 - -处理: - -- 不要只因为 DDL worker 的 `context deadline exceeded` 判定代码错误。 -- 继续看后面的 converge、finish mark、diff 和 log scan。 - -### weekly profile 的 `converge_timeout=30m` 过短 - -现象: - -```text -workload finished, waiting for converge: 20s -converge: waiting for finish mark, checkpoint=... 
-runner failed: context deadline exceeded -``` - -本次 seed `2026061509` 的证据: - -- workload 在 `2026-06-16 02:55:45` 结束并进入 converge。 -- 上游 finish mark DDL commit 时间为 `2026-06-16 10:56:05.681`。 -- 30 分钟 converge deadline 到期前,checkpoint 只到 `2026-06-16 10:38:29.999`。 -- checkpoint 持续前进,不是完全卡死;但还差约 `17m36s` 逻辑时间,按当时速率需要额外约 `56-58m`。 - -判断: - -- 如果上游 `db1.finish_mark` 已存在、下游还没有,且 CDC status 的 checkpoint 仍在推进,这更像 backlog 收敛窗口不足,不要立即当成 barrier 卡死。 -- 如果 checkpoint 连续超过 `no_advance_hard` 没前进,或者 CDC state 变成 failed/error,再按 CDC 正确性问题调查。 - -处理: - -- weekly profile 运行时使用更长收敛窗口: - -```bash -export RUN_CONVERGE_TIMEOUT=120m -``` - -- 修改后的 weekly random DDL run.sh 会在 `RUN_PROFILE=weekly` 且未显式指定时默认使用 `120m`;smoke 仍默认 `30m`。 -- 观察命令: - -```bash -curl -s http://127.0.0.1:8300/api/v2/changefeeds/weeklyrand | tr ',' '\n' | grep -E 'state|checkpoint_ts|checkpoint_time|resolved_ts' -mysql -uroot -h127.0.0.1 -P4000 -Nse "SELECT COUNT(*), IFNULL(MAX(v),0) FROM db1.finish_mark;" -mysql -uroot -h127.0.0.1 -P3306 -Nse "SHOW TABLES FROM db1 LIKE 'finish_mark'; SELECT COUNT(*), IFNULL(MAX(v),0) FROM db1.finish_mark;" 2>&1 || true -``` - -### 随机 DDL 的 TiDB 业务错误 - -现象: - -```text -err=Error 1071 (42000): Specified key was too long -err=Error 8200 (HY000): Unsupported modify charset from utf8mb4 to gbk -err=Error 1292 (22007): Truncated incorrect DOUBLE value -err=Error 1265 (01000): Data truncated for column -err=Error 1146 (42S02): Table ... doesn't exist -err=Error 1054 (42S22): Unknown column ... 
-``` - -判断: - -- 这些错误来自 random DDL runner 故意尝试高风险 DDL:加索引、改字符集、改列类型、drop/recover/rename 竞态等。 -- DDL worker 会记录错误并继续;只要 runner 没有最终 `runner failed:`,这些单条业务错误不是 case 失败。 -- 需要区分两类 `1146`: - - workload 中目标表被并发 drop/rename 后报 `1146`,通常是预期噪音; - - converge 阶段查询下游 `db1.finish_mark` 报 `1146`,表示下游还没追到 finish mark,需要结合 checkpoint 判断。 -- 需要区分两类 `1054`: - - workload 中 random DDL worker 对已变化的列执行 `modify_column_type` 等操作后报 `Unknown column`,通常是预期噪音; - - TiCDC sink 重试 DML 时在 `cdc.log` 出现 `ErrReachMaxTry`/`Unknown column`,或最终 `runner failed:` 关联到该错误,才是需要调查的同步错误。 - -处理: - -- 不要因为单条 DDL business error 停测试。 -- 真正需要处理的是: - - `runner failed:` 后的最终错误; - - sync diff 不一致; - - panic/fatal/data race; - - changefeed state failed/error; - - checkpoint 在 `no_advance_hard` 窗口内完全不推进。 - -快速过滤: - -```bash -grep -aE "runner failed|weekly_rand_single failed|checksum|panic|fatal|DATA RACE|state=failed|state=error" \ - /tmp/tidb_cdc_test/weekly_rand_single_5pass.log -``` - -### `RECOVER TABLE` 后下游 `Unknown column` - -现象: - -```text -changefeed state is not normal: warning -Error 1054 (42S22): Unknown column 'a' in 'field list' -REPLACE INTO `db1`.`t15_r_3235459` (`id`,`b`,`c`,`d`,`e`,`bin`,`a`) VALUES (...) 
-``` - -本次 seed `2026061509` 的证据: - -- 上游在 `03:42:43` 对 `db1.t15_r_3235459` 执行过 `DROP COLUMN a`,随后在 `03:42:44` drop table。 -- 上游在 `03:43:48` 执行 `RECOVER TABLE db1.t15_r_3235459`,TiCDC 事件中的 recovered `TableInfo` 带有列 `a`。 -- MySQL sink 在下游直接执行原始 `RECOVER TABLE db1.t15_r_3235459` 成功。 -- 后续 DML 按上游 recovered `TableInfo` 生成,SQL 包含 `a`;但下游实际恢复出来的表不带 `a`,因此报 `Unknown column 'a'` 并进入 warning。 - -判断: - -- 这不是 converge timeout;延长时间不会恢复。 -- 这也不是普通 random DDL business error;changefeed 已进入 warning,DML 会反复失败直到 runner 失败。 -- 根因是 `RECOVER TABLE` 依赖执行集群本地 DDL history / recycle-bin / GC snapshot 状态。TiCDC 内部 schema timeline 来自上游,sink 执行的 raw SQL 却让下游自己选择历史表;同名表多次 drop/recover/drop-column 后,上下游可能恢复到不同 schema。 -- `RECOVER TABLE BY JOB ` 不能直接用上游 drop job id 修复,因为下游执行时查的是下游自己的 DDL job id;TiCDC 当前没有维护上游 drop job 到下游 drop job 的映射。 -- 把 recover 改成 `CREATE TABLE` 也不是完整修复,因为会丢失 `RECOVER TABLE` 应恢复的旧数据。 - -处理: - -- weekly random DDL 默认集合不要生成 `recover_table`。 -- 当前修复是在 `tests/utils/random_ddl_test_runner/defaultDDLKinds()` 中移除 `recover_table`,保留 `genRecoverTable` 供显式测试。 -- 遇到类似日志时,先确认 random runner 是否又启用了 `recover_table`: - -```bash -grep -RIn "name: *\"recover_table\"" tests/utils/random_ddl_test_runner -grep -a "kind=recover_table" /tmp/tidb_cdc_test/weekly_rand_single/ddl_trace.log | tail -n 20 -grep -aE "Unknown column|state=warning|ErrReachMaxTry" /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 80 -``` - -- 如果未来要正式支持 CDC 复制 `RECOVER TABLE`,需要单独设计:例如维护下游 drop/truncate job id 映射并处理路由/重试/GC,或在 recover 后做完整数据重建/快照补偿。不要把这个产品级语义问题混进 weekly random case 修复。 - -### sink-side `ErrReachMaxTry Unknown column` after dispatcher recreate - -现象: - -```text -runner failed: changefeed state is not normal: warning -[CDC:ErrReachMaxTry] ... REPLACE INTO `db2`.`t14_r_3402273` (`id`,`a`,`b`,`c`,`d`,`e`,`bin`) ... 
-Error 1054 (42S22): Unknown column 'bin' in 'field list' -Error 1054 (42S22): Unknown column 'e' in 'field list' -``` - -判断: - -- 这不是 random DDL worker 的普通 `Unknown column` 业务噪音。 -- 只要错误出现在 TiCDC sink retry / `ErrReachMaxTry` / changefeed warning 路径,就按同步正确性问题调查。 -- 本次固定模式是:旧 dispatcher 已经执行过 table DDL 并 stopped 到更高 checkpoint;随后迟到的 add-table barrier 又从更旧 `startTs` 创建新 dispatcher,重放 DDL 之前的 DML,打到 DDL 之后的下游 schema。 -- 延长 `converge_timeout` 不能修复这类问题;延长只解决 backlog 仍在正常推进的 timeout。 - -快速定位: - -```bash -grep -aE "ErrReachMaxTry|Unknown column|changefeed state is not normal" \ - /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 120 - -grep -aE "new span replication created|add new table|dispatcher component has stopped|send reset dispatcher request|reset dispatcher" \ - /tmp/tidb_cdc_test/weekly_rand_single/cdc.log | tail -n 240 - -grep -a "" /tmp/tidb_cdc_test/weekly_rand_single/ddl_trace.log | tail -n 80 -``` - -处理: - -- 查旧 dispatcher stopped checkpoint 是否大于新 dispatcher startTs。 -- 查新 dispatcher replay 的第一条 DML commitTs 是否小于已经执行过的 DDL commitTs。 -- 如果满足上述条件,优先检查 `maintainer/span.Controller` 的 removed table checkpoint 是否记录并 clamp 了 `AddNewTable` 起点。 -- 当前修复点:`RecordRemovedSpanCheckpoint` + `AddNewSpans` table 级 startTs clamp;move 路径还需要确保 origin stopped status 写回 `replicaSet`。 - -### log scan `panic`/`fatal` false positives in random payload - -现象: - -```text -converge done: finish mark applied downstream -runner failed: log scan found 88 panic/fatal/race matches -``` - -判断: - -- 如果 log scan 失败发生在 `converge done` 之后,先不要按 TiCDC runtime panic 处理。 -- 抽样查看 `log scan match` 对应文件/行: - -```bash -grep -aE "log scan match|runner failed|converge done" \ - /tmp/tidb_cdc_test/weekly_rand_single_5pass.log | tail -n 120 - -sed -n ",p" \ - /tmp/tidb_cdc_test/weekly_rand_single/ -``` - -- 如果命中行是 `[DEBUG]` DML event / SQL builder 日志,并且 `panic`/`fatal` 只出现在随机字符串列值里,例如: - -```text -Bb8bdTFTEIN9i3spwifGjZj3AmFAtalR -1YCs3x0WFrKYaheC3jpXpAnicxBqG3pe -``` - -则这是 log scan 误报,不是实际 
panic/fatal。 - -处理: - -- 默认 `panic`/`fatal` 不能用裸 substring 扫描随机 DML payload。 -- 当前修复在 `tests/utils/random_ddl_test_runner/logscan.go` 中把默认关键字限定为真实严重日志模式: - - `fatal`: `[FATAL]`、`level=fatal`、行首 `fatal error:`。 From c9d405e9f3bfd1f2a71fcd997fe415f3a138f358 Mon Sep 17 00:00:00 2001 From: dongmen <414110582@qq.com> Date: Tue, 16 Jun 2026 20:48:32 +0800 Subject: [PATCH 3/3] tests: split downstream event replay fixes --- .../dispatcher/basic_dispatcher.go | 36 +------- .../basic_dispatcher_active_active_test.go | 85 ------------------- downstreamadapter/dispatcher/helper.go | 55 +----------- downstreamadapter/dispatcher/helper_test.go | 48 ----------- .../eventcollector/dispatcher_stat.go | 6 -- .../eventcollector/dispatcher_stat_test.go | 84 ------------------ .../eventcollector/event_collector.go | 34 +++----- pkg/eventservice/event_scanner.go | 6 +- pkg/eventservice/event_scanner_test.go | 67 +-------------- 9 files changed, 16 insertions(+), 405 deletions(-) delete mode 100644 downstreamadapter/dispatcher/helper_test.go diff --git a/downstreamadapter/dispatcher/basic_dispatcher.go b/downstreamadapter/dispatcher/basic_dispatcher.go index 4e210d7a93..72281639c6 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher.go +++ b/downstreamadapter/dispatcher/basic_dispatcher.go @@ -284,14 +284,6 @@ func (d *BasicDispatcher) AddDMLEventsToSink(events []*commonEvent.DMLEvent, wak // be rewritten into deletes when enable-active-active is disabled). filteredEvents := make([]*commonEvent.DMLEvent, 0, len(events)) for _, event := range events { - if d.blockEventStatus.isDMLCompletedOrObsolete(event.GetCommitTs()) { - log.Info("skip obsolete dml event", - zap.Stringer("dispatcher", d.id), - zap.Uint64("commitTs", event.GetCommitTs()), - zap.Uint64("seq", event.GetSeq())) - continue - } - // FilterDMLEvent returns the original event for normal tables and only // allocates a new event when the table needs active-active or soft-delete // processing. 
Skip is true when every row in the event is dropped, or when @@ -910,10 +902,6 @@ func (d *BasicDispatcher) reportBlockedEventDone( actionCommitTs uint64, actionIsSyncPoint bool, ) { - d.blockEventStatus.recordCompleted(BlockEventIdentifier{ - CommitTs: actionCommitTs, - IsSyncPoint: actionIsSyncPoint, - }) d.offerDoneBlockStatus(actionCommitTs, actionIsSyncPoint) GetDispatcherStatusDynamicStream().Wake(d.id) } @@ -997,9 +985,7 @@ func (d *BasicDispatcher) DealWithBlockEvent(event commonEvent.BlockEvent) { shouldBlock := d.shouldBlock(event) shouldHoldBlocked := d.shouldHoldBlockEvent(event) if shouldBlock && shouldHoldBlocked { - if !d.completeObsoleteBlockEvent(event) { - d.holdBlockEvent(event) - } + d.holdBlockEvent(event) return } // Writing a block event may involve downstream IO (e.g. executing DDL), so it must not block @@ -1027,9 +1013,6 @@ func (d *BasicDispatcher) DealWithBlockEvent(event commonEvent.BlockEvent) { } if shouldBlock { failpoint.Inject("BlockAfterFlush", nil) - if d.completeObsoleteBlockEvent(event) { - return - } d.reportBlockedEventToMaintainer(event) return } @@ -1212,20 +1195,6 @@ func (d *BasicDispatcher) reportBlockedEventToMaintainer(event commonEvent.Block d.offerBlockStatus(status) } -func (d *BasicDispatcher) completeObsoleteBlockEvent(event commonEvent.BlockEvent) bool { - if !d.blockEventStatus.isCompletedOrObsolete(event) { - return false - } - identifier := blockEventIdentifier(event) - log.Info("skip obsolete block event", - zap.Stringer("dispatcher", d.id), - zap.Uint64("commitTs", identifier.CommitTs), - zap.Bool("isSyncPoint", identifier.IsSyncPoint)) - d.PassBlockEventToSink(event) - d.reportBlockedEventDone(identifier.CommitTs, identifier.IsSyncPoint) - return true -} - func (d *BasicDispatcher) flushBlockedEventAndReportToMaintainer(event commonEvent.BlockEvent) { d.sharedInfo.GetBlockEventExecutor().Submit(d, func() { failpoint.Inject("BlockOrWaitBeforeFlush", nil) @@ -1234,9 +1203,6 @@ func (d *BasicDispatcher) 
flushBlockedEventAndReportToMaintainer(event commonEve return } failpoint.Inject("BlockAfterFlush", nil) - if d.completeObsoleteBlockEvent(event) { - return - } d.reportBlockedEventToMaintainer(event) }) } diff --git a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go index ca6f581b63..38d112571c 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go @@ -13,9 +13,7 @@ package dispatcher import ( - "context" "testing" - "time" "github.com/pingcap/ticdc/heartbeatpb" "github.com/pingcap/ticdc/pkg/common" @@ -128,89 +126,6 @@ func TestDDLEventsAlwaysValidateActiveActive(t *testing.T) { } } -func TestHandleEventsSkipsDMLBeforeCompletedBlockEvent(t *testing.T) { - sharedInfo := newTestSharedInfo(false, false, nil) - dispatcherSink := newDispatcherTestSink(t, common.MysqlSinkType) - tableSpan := &heartbeatpb.TableSpan{TableID: 1, StartKey: []byte{0}, EndKey: []byte{1}} - dispatcher := NewBasicDispatcher( - common.NewDispatcherID(), - tableSpan, - 100, - 1, - NewSchemaIDToDispatchers(), - false, - false, - 4096, - 0, - 200, - common.DefaultMode, - dispatcherSink.Sink(), - sharedInfo, - ) - - helper := commonEvent.NewEventTestHelper(t) - defer helper.Close() - helper.Tk().MustExec("use test") - helper.DDL2Event("create table t (id int primary key, v int)") - oldDML := helper.DML2Event("test", "t", "insert into t values (1, 1)") - oldDML.DispatcherID = dispatcher.id - oldDML.StartTs = 110 - oldDML.CommitTs = 120 - newDML := helper.DML2Event("test", "t", "insert into t values (2, 2)") - newDML.DispatcherID = dispatcher.id - newDML.StartTs = 130 - newDML.CommitTs = 140 - - dispatcher.blockEventStatus.recordCompleted(BlockEventIdentifier{CommitTs: 120}) - block := dispatcher.handleEvents([]DispatcherEvent{{Event: oldDML}, {Event: newDML}}, func() {}) - require.True(t, block) - - dmls := 
dispatcherSink.GetDMLs() - require.Len(t, dmls, 1) - require.Equal(t, uint64(140), dmls[0].CommitTs) -} - -func TestHeldObsoleteBlockEventCompletesWithoutWaitingReport(t *testing.T) { - sharedInfo := newTestSharedInfo(false, false, nil) - dispatcherSink := newDispatcherTestSink(t, common.MysqlSinkType) - dispatcherID := common.NewDispatcherID() - dispatcher := NewBasicDispatcher( - dispatcherID, - common.KeyspaceDDLSpan(common.DefaultKeyspaceID), - 100, - common.DDLSpanSchemaID, - NewSchemaIDToDispatchers(), - false, - false, - 4096, - 0, - 200, - common.DefaultMode, - dispatcherSink.Sink(), - sharedInfo, - ) - - event := commonEvent.NewSyncPointEvent(dispatcherID, 120, 1, 0) - dispatcher.pendingACKCount.Store(1) - dispatcher.DealWithBlockEvent(event) - require.NotNil(t, dispatcher.holdingBlockEvent) - require.Equal(t, 0, dispatcher.resendTaskMap.Len()) - - dispatcher.blockEventStatus.recordCompleted(BlockEventIdentifier{CommitTs: 120, IsSyncPoint: true}) - dispatcher.pendingACKCount.Store(0) - dispatcher.tryDealWithHeldBlockEvent() - - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - status := dispatcher.TakeBlockStatus(ctx) - require.NotNil(t, status) - require.Equal(t, heartbeatpb.BlockStage_DONE, status.State.Stage) - require.Equal(t, uint64(120), status.State.BlockTs) - require.True(t, status.State.IsSyncPoint) - require.Equal(t, 0, dispatcher.resendTaskMap.Len()) - require.Nil(t, dispatcher.blockEventStatus.getEvent()) -} - func newTestBasicDispatcher(t *testing.T, sinkType common.SinkType, enableActiveActive bool) *BasicDispatcher { t.Helper() sharedInfo := newTestSharedInfo(enableActiveActive, false, nil) diff --git a/downstreamadapter/dispatcher/helper.go b/downstreamadapter/dispatcher/helper.go index 580464df6c..c76b333832 100644 --- a/downstreamadapter/dispatcher/helper.go +++ b/downstreamadapter/dispatcher/helper.go @@ -86,8 +86,6 @@ type BlockEventStatus struct { blockPendingEvent commonEvent.BlockEvent 
blockStage heartbeatpb.BlockStage blockCommitTs uint64 - completed BlockEventIdentifier - hasCompleted bool } func (b *BlockEventStatus) clear() { @@ -108,33 +106,6 @@ func (b *BlockEventStatus) setBlockEvent(event commonEvent.BlockEvent, blockStag b.blockCommitTs = event.GetCommitTs() } -func (b *BlockEventStatus) isCompletedOrObsolete(event commonEvent.BlockEvent) bool { - b.mutex.Lock() - defer b.mutex.Unlock() - - if !b.hasCompleted { - return false - } - return compareBlockEventIdentifier(blockEventIdentifier(event), b.completed) <= 0 -} - -func (b *BlockEventStatus) isDMLCompletedOrObsolete(commitTs uint64) bool { - b.mutex.Lock() - defer b.mutex.Unlock() - - return b.hasCompleted && commitTs <= b.completed.CommitTs -} - -func (b *BlockEventStatus) recordCompleted(identifier BlockEventIdentifier) { - b.mutex.Lock() - defer b.mutex.Unlock() - - if !b.hasCompleted || compareBlockEventIdentifier(identifier, b.completed) > 0 { - b.completed = identifier - b.hasCompleted = true - } -} - func (b *BlockEventStatus) updateBlockStage(blockStage heartbeatpb.BlockStage) { b.mutex.Lock() defer b.mutex.Unlock() @@ -168,8 +139,7 @@ func (b *BlockEventStatus) actionMatchs(action *heartbeatpb.DispatcherAction) bo return false } - pendingIsSyncPoint := b.blockPendingEvent.GetType() == commonEvent.TypeSyncPointEvent - return b.blockCommitTs == action.CommitTs && pendingIsSyncPoint == action.IsSyncPoint + return b.blockCommitTs == action.CommitTs } // ignoredStatusMatches checks whether the ignored status is for the current pending ddl/sync point event. 
@@ -199,29 +169,6 @@ func (b *BlockEventStatus) getEventCommitTs() (uint64, bool) { return b.blockCommitTs, true } -func blockEventIdentifier(event commonEvent.BlockEvent) BlockEventIdentifier { - return BlockEventIdentifier{ - CommitTs: event.GetCommitTs(), - IsSyncPoint: event.GetType() == commonEvent.TypeSyncPointEvent, - } -} - -func compareBlockEventIdentifier(a, b BlockEventIdentifier) int { - if a.CommitTs < b.CommitTs { - return -1 - } - if a.CommitTs > b.CommitTs { - return 1 - } - if a.IsSyncPoint == b.IsSyncPoint { - return 0 - } - if !a.IsSyncPoint && b.IsSyncPoint { - return -1 - } - return 1 -} - type SchemaIDToDispatchers struct { mutex sync.RWMutex m map[int64]map[common.DispatcherID]interface{} diff --git a/downstreamadapter/dispatcher/helper_test.go b/downstreamadapter/dispatcher/helper_test.go deleted file mode 100644 index e811a41e19..0000000000 --- a/downstreamadapter/dispatcher/helper_test.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package dispatcher - -import ( - "testing" - - "github.com/pingcap/ticdc/heartbeatpb" - "github.com/pingcap/ticdc/pkg/common" - commonEvent "github.com/pingcap/ticdc/pkg/common/event" - "github.com/stretchr/testify/require" -) - -func TestBlockEventStatusCompletedWatermark(t *testing.T) { - var status BlockEventStatus - ddl10 := &commonEvent.DDLEvent{FinishedTs: 10} - syncpoint10 := commonEvent.NewSyncPointEvent(common.NewDispatcherID(), 10, 1, 0) - ddl11 := &commonEvent.DDLEvent{FinishedTs: 11} - - status.recordCompleted(BlockEventIdentifier{CommitTs: 10, IsSyncPoint: false}) - require.True(t, status.isCompletedOrObsolete(ddl10)) - require.False(t, status.isCompletedOrObsolete(syncpoint10)) - require.False(t, status.isCompletedOrObsolete(ddl11)) - - status.recordCompleted(BlockEventIdentifier{CommitTs: 10, IsSyncPoint: true}) - require.True(t, status.isCompletedOrObsolete(ddl10)) - require.True(t, status.isCompletedOrObsolete(syncpoint10)) - require.False(t, status.isCompletedOrObsolete(ddl11)) -} - -func TestBlockEventStatusActionMatchesSyncPointFlag(t *testing.T) { - var status BlockEventStatus - status.setBlockEvent(&commonEvent.DDLEvent{FinishedTs: 10}, heartbeatpb.BlockStage_WAITING) - - require.True(t, status.actionMatchs(&heartbeatpb.DispatcherAction{CommitTs: 10})) - require.False(t, status.actionMatchs(&heartbeatpb.DispatcherAction{CommitTs: 10, IsSyncPoint: true})) -} diff --git a/downstreamadapter/eventcollector/dispatcher_stat.go b/downstreamadapter/eventcollector/dispatcher_stat.go index 0b5b1164dc..0153b3dc16 100644 --- a/downstreamadapter/eventcollector/dispatcher_stat.go +++ b/downstreamadapter/eventcollector/dispatcher_stat.go @@ -137,12 +137,6 @@ func (d *dispatcherStat) advanceEpochForReset(resetTs uint64) uint64 { currentState := d.loadCurrentEpochState() nextState := newDispatcherEpochState(currentState.epoch+1, 0, resetTs) if d.currentEpoch.CompareAndSwap(currentState, nextState) { - // The new epoch replays events from resetTs. 
Commit-ts based - // deduplication from the old epoch must not filter replayed DDL or - // SyncPoint events. - d.lastEventCommitTs.Store(resetTs) - d.gotDDLOnTs.Store(false) - d.gotSyncpointOnTS.Store(false) return nextState.epoch } } diff --git a/downstreamadapter/eventcollector/dispatcher_stat_test.go b/downstreamadapter/eventcollector/dispatcher_stat_test.go index 05bdcbf243..9c3088a6eb 100644 --- a/downstreamadapter/eventcollector/dispatcher_stat_test.go +++ b/downstreamadapter/eventcollector/dispatcher_stat_test.go @@ -29,7 +29,6 @@ import ( "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/messaging" "github.com/pingcap/ticdc/pkg/node" - "github.com/pingcap/ticdc/utils/dynstream" "github.com/pingcap/tidb/pkg/util/chunk" "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/oracle" @@ -553,53 +552,6 @@ func TestUpdateCommitTsStateByEvents(t *testing.T) { require.Equal(t, uint64(110), state.maxEventTs.Load()) } -func TestAdvanceEpochForResetClearsCommitTsFilter(t *testing.T) { - t.Parallel() - - dispatcherID := common.NewDispatcherID() - eventServiceID := node.ID("event-service-1") - mockDisp := newMockDispatcher(dispatcherID, 100) - mockDisp.handleEvents = func(events []dispatcher.DispatcherEvent, wakeCallback func()) (block bool) { - return len(events) > 0 - } - - stat := newDispatcherStatForTest(mockDisp, nil) - stat.currentEpoch.Store(newDispatcherEpochState(10, 3, stat.target.GetStartTs())) - stat.lastEventCommitTs.Store(220) - stat.gotDDLOnTs.Store(true) - stat.gotSyncpointOnTS.Store(true) - - epoch := stat.advanceEpochForReset(150) - require.Equal(t, uint64(11), epoch) - require.Equal(t, uint64(150), stat.lastEventCommitTs.Load()) - require.False(t, stat.gotDDLOnTs.Load()) - require.False(t, stat.gotSyncpointOnTS.Load()) - - handshake := commonEvent.NewHandshakeEvent(dispatcherID, 160, epoch, &common.TableInfo{}) - stat.handleHandshakeEvent(dispatcher.DispatcherEvent{ - From: &eventServiceID, - Event: &handshake, - }) - - 
ddl := &commonEvent.DDLEvent{ - Version: commonEvent.DDLEventVersion1, - FinishedTs: 180, - Seq: 2, - Epoch: epoch, - } - require.True(t, stat.handleSingleDataEvents([]dispatcher.DispatcherEvent{ - { - From: &eventServiceID, - Event: ddl, - }, - })) - require.Len(t, mockDisp.events, 1) - require.Same(t, ddl, mockDisp.events[0].Event) - require.Equal(t, uint64(180), stat.lastEventCommitTs.Load()) - require.True(t, stat.gotDDLOnTs.Load()) - require.False(t, stat.gotSyncpointOnTS.Load()) -} - func TestHandleSignalEvent(t *testing.T) { localServerID := node.ID("local-server") remoteServerID := node.ID("remote-server") @@ -945,42 +897,6 @@ func TestInitialLocalReadyCallbackIsOneShot(t *testing.T) { requireNoDispatcherRequest(t, mockEventCollector) } -func TestReleasePathFeedbackResetsCurrentEventService(t *testing.T) { - localServerID := node.ID("local-server") - dispatcherID := common.NewDispatcherID() - cfID := common.NewChangeFeedIDWithName("release_path_test", common.DefaultKeyspaceName) - mockDisp := newMockDispatcher(dispatcherID, 10) - mockDisp.changefeedID = cfID - mockDisp.checkPointTs = 20 - mockEventCollector := newTestEventCollector(localServerID) - stat := newDispatcherStat(mockDisp, mockEventCollector, nil) - setSessionState(stat.session, localServerID, false, "") - mockEventCollector.dispatcherMap.Store(dispatcherID, stat) - mockEventCollector.changefeedMap.Store(cfID.ID(), newChangefeedStat(cfID)) - - released := false - feedback := dynstream.Feedback[common.GID, common.DispatcherID, *dispatcherStat]{ - Area: cfID.ID(), - Path: dispatcherID, - FeedbackType: dynstream.ReleasePath, - } - mockEventCollector.handleReleasePathFeedback(feedback, func(path common.DispatcherID) { - released = true - require.Equal(t, dispatcherID, path) - }, "DS") - - require.True(t, released) - cfStatValue, ok := mockEventCollector.changefeedMap.Load(cfID.ID()) - require.True(t, ok) - require.Equal(t, uint32(1), cfStatValue.(*changefeedStat).memoryReleaseCount.Load()) - 
requireDispatcherRequests( - t, - readDispatcherRequests(t, mockEventCollector, 1), - dispatcherRequestRecord{to: localServerID, action: eventpb.ActionType_ACTION_TYPE_RESET}, - ) - requireNoDispatcherRequest(t, mockEventCollector) -} - func TestIsFromCurrentEpoch(t *testing.T) { t.Parallel() diff --git a/downstreamadapter/eventcollector/event_collector.go b/downstreamadapter/eventcollector/event_collector.go index 7319279a54..6456f1ada2 100644 --- a/downstreamadapter/eventcollector/event_collector.go +++ b/downstreamadapter/eventcollector/event_collector.go @@ -448,38 +448,24 @@ func (c *EventCollector) processDSFeedback(ctx context.Context) error { return context.Cause(ctx) case feedback := <-c.ds.Feedback(): if feedback.FeedbackType == dynstream.ReleasePath { - c.handleReleasePathFeedback(feedback, c.ds.Release, "DS") + if v, ok := c.changefeedMap.Load(feedback.Area); ok { + v.(*changefeedStat).memoryReleaseCount.Add(1) + } + log.Info("release dispatcher memory in DS", zap.Any("dispatcherID", feedback.Path)) + c.ds.Release(feedback.Path) } case feedback := <-c.redoDs.Feedback(): if feedback.FeedbackType == dynstream.ReleasePath { - c.handleReleasePathFeedback(feedback, c.redoDs.Release, "redo DS") + if v, ok := c.changefeedMap.Load(feedback.Area); ok { + v.(*changefeedStat).memoryReleaseCount.Add(1) + } + log.Info("release dispatcher memory in redo DS", zap.Any("dispatcherID", feedback.Path)) + c.redoDs.Release(feedback.Path) } } } } -func (c *EventCollector) handleReleasePathFeedback( - feedback dynstream.Feedback[common.GID, common.DispatcherID, *dispatcherStat], - release func(common.DispatcherID), - streamName string, -) { - if v, ok := c.changefeedMap.Load(feedback.Area); ok { - v.(*changefeedStat).memoryReleaseCount.Add(1) - } - log.Info("release dispatcher memory in "+streamName, zap.Any("dispatcherID", feedback.Path)) - release(feedback.Path) - - stat := c.getDispatcherStatByID(feedback.Path) - if stat == nil { - return - } - log.Info("reset dispatcher 
after releasing queued events", - zap.Stringer("changefeedID", stat.target.GetChangefeedID()), - zap.Stringer("dispatcherID", feedback.Path), - zap.String("stream", streamName)) - stat.session.resetCurrentEventService() -} - func (c *EventCollector) sendDispatcherRequests(ctx context.Context) error { for { select { diff --git a/pkg/eventservice/event_scanner.go b/pkg/eventservice/event_scanner.go index d4a948e870..b082ccfc13 100644 --- a/pkg/eventservice/event_scanner.go +++ b/pkg/eventservice/event_scanner.go @@ -229,11 +229,9 @@ func (s *eventScanner) scanAndMergeEvents( if err != nil { return false, err } - // The table has been deleted, so the current raw event cannot be - // decoded as DML. Resolve to its commit ts to skip it; resolving to - // rawEvent.CRTs-1 can equal the scan start and cause a no-progress loop. + // table is deleted, still append remaining DDL event and resolved event. if tableInfo == nil { - err = finalizeScan(merger, processor, session, rawEvent.CRTs) + err = finalizeScan(merger, processor, session, rawEvent.CRTs-1) return false, err } diff --git a/pkg/eventservice/event_scanner_test.go b/pkg/eventservice/event_scanner_test.go index bbf1cbdeb9..7a76adeae9 100644 --- a/pkg/eventservice/event_scanner_test.go +++ b/pkg/eventservice/event_scanner_test.go @@ -449,7 +449,6 @@ func TestEventScannerWithDeleteTable(t *testing.T) { dml0 := kvEvents[0] dml1 := kvEvents[1] dml2 := kvEvents[2] - dml3 := kvEvents[3] mockSchemaStore.DeleteTable(tableID, dml2.CRTs) disp.receivedResolvedTs.Store(resolvedTs) ok, dataRange := broker.getScanTaskDataRange(disp) @@ -481,12 +480,10 @@ func TestEventScannerWithDeleteTable(t *testing.T) { require.Equal(t, batchDML1.DMLEvents[0].GetCommitTs(), dml1.CRTs) require.Equal(t, batchDML1.DMLEvents[1].GetCommitTs(), dml2.CRTs) - // resolvedTs skips the first raw event after the table is deleted, so the - // next scan range will not keep seeing the same deleted-table event. 
+ // resolvedTs e = events[3] require.Equal(t, e.GetType(), event.TypeResolvedEvent) - require.Equal(t, dml3.CRTs, e.GetCommitTs()) - require.Greater(t, e.GetCommitTs(), dml2.CRTs) + require.Equal(t, dml2.CRTs, e.GetCommitTs()) } // TestEventScannerWithDDL tests cases where scanning is interrupted at DDL events @@ -1570,66 +1567,6 @@ func TestScanAndMergeEventsSingleUKUpdate(t *testing.T) { require.True(t, sess.scannedBytes > 0) // Some bytes were processed } -func TestScanAndMergeEventsSkipsDeletedTableTxn(t *testing.T) { - helper := event.NewEventTestHelper(t) - defer helper.Close() - - ddlEvent, kvEvents := genEvents(helper, - `create table test.t_deleted(id int primary key, c char(50))`, - `insert into test.t_deleted(id,c) values (1, "c1")`) - require.Len(t, kvEvents, 1) - rawEvent := kvEvents[0] - tableID := ddlEvent.GetTableID() - - schemaStore := &schemaStoreWithErr{ - mockSchemaStore: NewMockSchemaStore(), - getTableInfoError: &schemastore.TableDeletedError{}, - } - scanner := &eventScanner{ - mounter: &mockMounter{}, - schemaGetter: schemaStore, - } - - disInfo := newMockDispatcherInfoForTest(t) - disInfo.span.TableID = tableID - dispatcherID := common.NewDispatcherID() - disp := &dispatcherStat{ - info: disInfo, - id: dispatcherID, - isRemoved: atomic.Bool{}, - } - - dataRange := common.DataRange{ - Span: &heartbeatpb.TableSpan{ - TableID: tableID, - }, - CommitTsStart: rawEvent.CRTs - 1, - CommitTsEnd: rawEvent.CRTs + 100, - } - sess := &session{ - ctx: context.Background(), - dispatcherStat: disp, - dataRange: dataRange, - startTime: time.Now(), - events: make([]event.Event, 0), - } - merger := newEventMerger(nil) - - isInterrupted, err := scanner.scanAndMergeEvents(sess, merger, &mockEventIterator{ - events: []*common.RawKVEntry{rawEvent}, - }) - require.NoError(t, err) - require.False(t, isInterrupted) - require.Zero(t, sess.dmlCount) - require.Len(t, sess.events, 1) - - resolvedEvent, ok := sess.events[0].(event.ResolvedEvent) - require.True(t, ok) - 
require.Equal(t, dispatcherID, resolvedEvent.DispatcherID) - require.Equal(t, rawEvent.CRTs, resolvedEvent.ResolvedTs) - require.Greater(t, resolvedEvent.ResolvedTs, dataRange.CommitTsStart) -} - type schemaStoreWithErr struct { *mockSchemaStore getTableInfoError error