9 changes: 8 additions & 1 deletion .github/actions/setup-integration/action.yml
@@ -15,6 +15,13 @@ runs:
sudo apt-get update
sudo apt-get install -y postgresql-client-17

- name: Clean up stale containers and free ports
shell: bash
run: |
docker compose -f docker-compose.test.yml down -v 2>/dev/null || true
docker rm -f pg_dbmigrator_source pg_dbmigrator_target 2>/dev/null || true
sudo systemctl stop postgresql 2>/dev/null || true

- name: Start docker-compose stack
shell: bash
- run: docker compose -f docker-compose.test.yml up -d --wait
+ run: docker compose -f docker-compose.test.yml up -d --wait --force-recreate
42 changes: 42 additions & 0 deletions .github/workflows/ci.yml
@@ -282,6 +282,46 @@ jobs:
if: always()
run: docker compose -f docker-compose.test.yml down -v

integration-online-auto-pub-lifecycle:
name: Integration test (online, auto-create publication + post-cutover cleanup)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- uses: ./.github/actions/setup-integration

- name: Run online auto-pub lifecycle e2e
run: bash tests/integration/run_online_auto_pub_lifecycle.sh

- name: Show docker logs on failure
if: failure()
run: docker compose -f docker-compose.test.yml logs

- name: Tear down
if: always()
run: docker compose -f docker-compose.test.yml down -v

integration-online-keep-slot:
name: Integration test (online, keep-slot flag retains slot + pre-existing publication)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- uses: ./.github/actions/setup-integration

- name: Run online keep-slot e2e
run: bash tests/integration/run_online_keep_slot.sh

- name: Show docker logs on failure
if: failure()
run: docker compose -f docker-compose.test.yml logs

- name: Tear down
if: always()
run: docker compose -f docker-compose.test.yml down -v

publish:
name: Publish to crates.io
if: startsWith(github.ref, 'refs/tags/v')
@@ -299,6 +339,8 @@ jobs:
- integration-online-cancel-resume
- integration-online-multi-resume-sustained
- integration-online-sequence-sync
- integration-online-auto-pub-lifecycle
- integration-online-keep-slot
runs-on: ubuntu-latest
permissions:
contents: write
609 changes: 168 additions & 441 deletions Agent.md

Large diffs are not rendered by default.

88 changes: 73 additions & 15 deletions README.md
@@ -12,7 +12,7 @@ keeps PostgreSQL's built-in logical replication apply worker pulling from
the source so the operator can cut over with near-zero downtime.

The online path issues `CREATE SUBSCRIPTION` on the target attached to a
slot we created with `EXPORT_SNAPSHOT` before `pg_dump` ran.
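
In SQL terms this is roughly the following sketch. The slot name
`pg_dbmigrator_slot` is illustrative (only the publication and
subscription names are documented defaults), and the replication-protocol
syntax shown is the pre-PG 15 form:

```sql
-- On the source, over a replication connection: create the slot and
-- export a snapshot (PG 15+ spells this (SNAPSHOT 'export')).
CREATE_REPLICATION_SLOT pg_dbmigrator_slot LOGICAL pgoutput EXPORT_SNAPSHOT;

-- On the target, after pg_dump/pg_restore ran against that snapshot:
-- attach to the existing slot and skip the initial copy, since the
-- restore already moved the data.
CREATE SUBSCRIPTION pg_dbmigrator_sub
  CONNECTION 'host=source-host dbname=app user=repl'
  PUBLICATION pg_dbmigrator_pub
  WITH (create_slot = false, slot_name = 'pg_dbmigrator_slot', copy_data = false);
```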

## Modes

@@ -24,9 +24,14 @@ slot we created with `EXPORT_SNAPSHOT` before `pg_dump` ran.
### Online migration phases

```
- Validate → SourceVacuum → PrepareSnapshot → Dump → Restore → Analyze → StreamApply → (Lag heartbeat …) → CaughtUp → Cutover → Complete
+ Validate → SourceVacuum → PrepareSnapshot → Dump → Restore → Analyze → StreamApply → (Lag heartbeat …) → CaughtUp → Cutover → SourceCleanup → Complete
```

* `Validate` pre-flights the source (`wal_level = 'logical'`,
`max_replication_slots > 0`, `max_wal_senders > 0`) and ensures the
required publication exists — auto-creating it if missing (see
[Publication lifecycle](#publication--replication-resource-lifecycle)
below; a SQL sketch of the checks follows this list).
* `SourceVacuum` runs `VACUUM ANALYZE` on the source to reclaim dead tuples
and refresh planner statistics before the dump. Skip with `--skip-source-vacuum`.
* `PrepareSnapshot` creates the replication slot first; `START_REPLICATION`
@@ -41,13 +46,15 @@ Validate → SourceVacuum → PrepareSnapshot → Dump → Restore → Analyze
signal the customer watches to decide when to cut over.
* When the lag drops to or below `--lag-threshold-bytes`, a one-shot
`CaughtUp` event is emitted (“ready for cutover”).
* `SourceCleanup` (after cutover) drops auto-created publications and
replication slots on the source — see
[Publication / replication resource lifecycle](#publication--replication-resource-lifecycle).
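
A hedged sketch of the `Validate` pre-flight as SQL you can run yourself
on the source (publication name assumes the default `pg_dbmigrator_pub`):

```sql
SHOW wal_level;              -- must be 'logical'
SHOW max_replication_slots;  -- must be > 0
SHOW max_wal_senders;        -- must be > 0

-- A row here means the publication already exists; if it is missing the
-- migrator creates it (unless --no-auto-create-publication is passed).
SELECT pubname FROM pg_publication WHERE pubname = 'pg_dbmigrator_pub';
```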

## Install / build

```bash
# Install the CLI from source. Produces a binary called `pg_dbmigrator`.
cargo install pg_dbmigrator
pg_dbmigrator --help
pg_dbmigrator --mode offline --source '…' --target '…' --jobs 4
```

@@ -75,9 +82,19 @@ On the source, before starting:

```sql
ALTER SYSTEM SET wal_level = 'logical'; -- requires restart
CREATE PUBLICATION pg_dbmigrator_pub FOR ALL TABLES;
```

The publication is auto-created by the migrator if it does not already
exist (default `FOR ALL TABLES`). If you prefer to create it manually —
e.g. to publish only specific tables — run:

```sql
CREATE PUBLICATION pg_dbmigrator_pub FOR TABLE my_schema.t1, my_schema.t2;
```

Then pass `--no-auto-create-publication` so the migrator uses the existing
one and does not attempt to create or drop it.

```bash
pg_dbmigrator \
--mode online \
@@ -91,11 +108,6 @@ pg_dbmigrator \
--cutover-poll-secs 5
```

- The library creates a subscription called `--subscription-name` (default
- `pg_dbmigrator_sub`) on the target attached to the existing slot. On cutover
- the subscription is disabled and, unless `--keep-subscription` is set,
- dropped.

Before the dump runs, the migrator pre-flights the source:
`wal_level = 'logical'`, `max_replication_slots > 0`, and
`max_wal_senders > 0`. A misconfigured source fails fast with a clear
@@ -119,6 +131,33 @@ pg_dbmigrator --mode offline \
--exclude-table public.large_log
```

## Publication / replication resource lifecycle

The migrator fully manages the lifecycle of the replication resources it
creates, so the operator does not need to run manual cleanup SQL after a
successful cutover.

| Resource | Created by | Cleaned up at cutover | Override |
|---|---|---|---|
| Publication on source | Auto-created if missing (default) | Dropped only if it was auto-created | `--no-auto-create-publication` |
| Replication slot on source | Always created by the migrator | Dropped by default | `--keep-slot` |
| Subscription on target | Always created by the migrator | Dropped by default | `--keep-subscription` |

**Auto-create publication**: By default, the migrator checks whether the
named publication (`--publication`, default `pg_dbmigrator_pub`) exists on
the source. If it does not, the migrator creates it as `FOR ALL TABLES`
(or scoped to `--table` / `--schema` if specified). Auto-created
publications are tracked and dropped on the source after a successful
cutover. Pre-existing publications are never dropped.
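
Conceptually the auto-create path amounts to the following (a sketch,
not necessarily the migrator's literal SQL):

```sql
-- During Validate: does the named publication exist?
SELECT 1 FROM pg_publication WHERE pubname = 'pg_dbmigrator_pub';

-- If not, create it (scoped to --table / --schema when those are set):
CREATE PUBLICATION pg_dbmigrator_pub FOR ALL TABLES;

-- During SourceCleanup, only when the publication was created above:
DROP PUBLICATION IF EXISTS pg_dbmigrator_pub;
```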

**Slot cleanup**: After cutover, the replication slot on the source is no
longer needed. By default the migrator drops it. Pass `--keep-slot` if
you need to inspect the slot post-migration or if another consumer
shares it.
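
If you did pass `--keep-slot`, the manual cleanup later is a single call
(the slot name below is illustrative; use the one from your run). Note
that an abandoned slot pins WAL on the source, so it should not be left
around indefinitely:

```sql
-- Find the leftover slot; it must be inactive before it can be dropped.
SELECT slot_name, active FROM pg_replication_slots;

SELECT pg_drop_replication_slot('pg_dbmigrator_slot');
```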

All cleanup steps are best-effort — failures are logged as warnings but
do not abort the migration.

## Cutover (online mode)

Cutover is driven by `SIGINT` (Ctrl+C). The CLI prints a periodic `Lag` heartbeat after the dump completes, so the operator has a continuous bytes-behind read-out:
@@ -135,19 +174,38 @@ When the customer is satisfied with the lag, they press **Ctrl+C** once:
* The streaming apply loop notices the request on its next poll, flushes
the last LSN feedback to the source, emits a `Cutover` event, and
returns.
- * `Migrator::run` returns with `MigrationOutcome::cutover_triggered()
-   == true`. The process exits cleanly. Application traffic can now be
-   switched to the target.
+ * The migrator syncs sequences (sketched below), cleans up replication
+   resources (publication, slot, subscription), and returns with
+   `MigrationOutcome::cutover_triggered() == true`. The process exits
+   cleanly. Application traffic can now be switched to the target.
* A second Ctrl+C is treated as an abort (escape hatch — only use it if
the graceful path is stuck).
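
The sequence sync mentioned above is, in effect, a read-then-`setval`
per sequence. A hedged sketch with an illustrative sequence name and
value (the real implementation may batch or quote differently):

```sql
-- On the source: capture the current position of each sequence.
SELECT last_value FROM app.orders_id_seq;

-- On the target: advance the sequence past the source value so rows
-- inserted after cutover don't collide (here last_value was 41520).
SELECT setval('app.orders_id_seq', 41520, true);
```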

- Cutover is always operator-driven; `--lag-threshold-bytes` is purely advisory and only controls when the one-shot `CaughtUp` "ready for cutover" event fires.
+ Cutover is always operator-driven; `--lag-threshold-bytes` is purely advisory and only controls when the one-shot `CaughtUp` ("ready for cutover") event fires.

For online migrations, hold on to `migrator.cutover_handle()` and call `request()` from your own signal handler / RPC endpoint when the operator is ready to cut over. See [`examples/online_migration`](examples/online_migration) for a complete program that wires Ctrl+C to the cutover handle.

## Performance defaults

The CLI ships with sensible defaults tuned for migration speed. Override
only when you have a specific reason.

| Default | Flag to override | Effect |
|---|---|---|
| Split-section restore | `--no-split-sections` | Bulk COPY without index maintenance, then rebuild indexes in parallel. 30-60% faster on index-heavy schemas. |
| `lz4:1` dump compression | `--dump-compress <spec>` | Negligible CPU, 3-5x smaller archive. Use `zstd:3` for better ratio, `none` to disable. |
| `--no-sync` on dump | `--keep-sync` | Skip fsync on transient dump files. |
| `--no-comments` | _(not exposed)_ | Omit COMMENT ON statements from dump. |
| `--no-security-labels` | _(not exposed)_ | Omit `SECURITY LABEL` statements from dump. |
| `--no-publications` | `--keep-publications` | Don't dump publication definitions to the target. |
| `--no-subscriptions` | `--keep-subscriptions` | Don't dump subscription definitions to the target. |
| Auto-detect `--jobs` | `--jobs N` | Clamps to `[1, 8]` based on host CPU count. |
| Pre-dump `VACUUM ANALYZE` | `--skip-source-vacuum` | Clean heap pages + fresh stats before dump. |
| Post-restore `ANALYZE` | `--skip-analyze` | Fresh planner stats on target immediately after restore. |

## Benchmark

- See [BENCHMARK.md](BENCHMARK.md) for migration performance results across 10 GB – 200 GB datasets (PG 16 → PG 18, 8 parallel jobs, zstd compression).
+ See [BENCHMARK.md](BENCHMARK.md) for migration performance results across 10 GB -- 200 GB datasets (PG 16 -> PG 18, 8 parallel jobs, zstd compression).

## Known limitations

@@ -156,7 +214,7 @@
* DDL changes are not migrated automatically — refresh the publication
and restart the migration if the schema changes during the run.
* Extensions whose internal state cannot be re-created on the target
- (Azure-reserved extensions, pg_cron metadata, …) may cause
+ (Azure-reserved extensions, pg_cron metadata, ...) may cause
`pg_restore` to exit with code 1. Pass `--allow-restore-errors` to
treat that as a non-fatal warning when user data was restored
successfully.
2 changes: 2 additions & 0 deletions examples/online_migration/src/main.rs
@@ -90,6 +90,8 @@ async fn main() -> Result<()> {
drop_subscription_on_cutover: !env_flag("PG_DBMIGRATOR_KEEP_SUBSCRIPTION"),
force_clean: env_flag("PG_DBMIGRATOR_FORCE_CLEAN"),
sync_sequences_on_cutover: !env_flag("PG_DBMIGRATOR_NO_SEQUENCE_SYNC"),
auto_create_publication: !env_flag("PG_DBMIGRATOR_NO_AUTO_CREATE_PUB"),
drop_slot_on_cutover: !env_flag("PG_DBMIGRATOR_KEEP_SLOT"),
apply: ReplicationApplyConfig {
feedback_interval: Duration::from_secs(env_secs("PG_DBMIGRATOR_FEEDBACK_SECS", 5)),
connection_timeout: Duration::from_secs(15),
56 changes: 56 additions & 0 deletions pg_dbmigrator/src/bin/pg_dbmigrator/args.rs
@@ -100,6 +100,14 @@ pub struct Cli {
#[arg(long)]
pub no_sequence_sync: bool,

/// Disable auto-creation of the publication on the source. By default the migrator creates the publication automatically if it does not exist. Pass this flag to require the operator to create it manually before running the migration. Online mode only.
#[arg(long)]
pub no_auto_create_publication: bool,

/// Keep the replication slot on the source after cutover (default: drop it). Online mode only.
#[arg(long)]
pub keep_slot: bool,

/// pgoutput protocol version.
#[arg(long, default_value_t = 2)]
pub protocol_version: u32,
@@ -285,6 +293,8 @@ impl Cli {
drop_subscription_on_cutover: !self.keep_subscription,
force_clean: self.force_clean,
sync_sequences_on_cutover: !self.no_sequence_sync,
auto_create_publication: !self.no_auto_create_publication,
drop_slot_on_cutover: !self.keep_slot,
apply,
cutover: CutoverConfig {
poll_interval: std::time::Duration::from_secs(self.cutover_poll_secs),
@@ -730,4 +740,50 @@ mod tests {
assert!(!cfg.skip_analyze);
assert!(!cfg.skip_source_vacuum);
}

#[test]
fn into_config_no_auto_create_publication() {
let cli = parse_args(&[
"pg_dbmigrator",
"--mode",
"online",
"--source",
"postgresql://u@src/db",
"--target",
"postgresql://u@dst/db",
"--no-auto-create-publication",
]);
let cfg = cli.into_config().unwrap();
assert!(!cfg.online.auto_create_publication);
}

#[test]
fn into_config_keep_slot() {
let cli = parse_args(&[
"pg_dbmigrator",
"--mode",
"online",
"--source",
"postgresql://u@src/db",
"--target",
"postgresql://u@dst/db",
"--keep-slot",
]);
let cfg = cli.into_config().unwrap();
assert!(!cfg.online.drop_slot_on_cutover);
}

#[test]
fn into_config_defaults_auto_create_publication_and_drop_slot() {
let cli = parse_args(&[
"pg_dbmigrator",
"--source",
"postgresql://u@src/db",
"--target",
"postgresql://u@dst/db",
]);
let cfg = cli.into_config().unwrap();
assert!(cfg.online.auto_create_publication);
assert!(cfg.online.drop_slot_on_cutover);
}
}
28 changes: 28 additions & 0 deletions pg_dbmigrator/src/config.rs
@@ -392,6 +392,12 @@ pub struct OnlineOptions {
/// out-of-band sequence sync (e.g. application-level UUIDs).
#[serde(default = "default_true")]
pub sync_sequences_on_cutover: bool,
/// If `true` (default), automatically create the publication on the source if it does not already exist. The publication will be created as `FOR ALL TABLES` (or scoped by `MigrationConfig::tables` / `MigrationConfig::schemas` if set). Publications auto-created by the migrator are tracked and dropped after cutover.
#[serde(default = "default_true")]
pub auto_create_publication: bool,
/// If `true` (default), drop the replication slot on the source after a successful cutover. The slot is no longer needed once the target has caught up and the subscription is torn down.
#[serde(default = "default_true")]
pub drop_slot_on_cutover: bool,
/// Configuration for the WAL apply worker.
pub apply: ReplicationApplyConfig,
/// Cutover knobs — when to declare the target "caught up" and how the
@@ -414,6 +420,8 @@ impl Default for OnlineOptions {
drop_subscription_on_cutover: true,
force_clean: false,
sync_sequences_on_cutover: true,
auto_create_publication: true,
drop_slot_on_cutover: true,
apply: ReplicationApplyConfig::default(),
cutover: CutoverConfig::default(),
}
@@ -930,6 +938,26 @@ mod tests {
assert!(opts.subscription_source_conn.is_none());
}

#[test]
fn online_options_default_auto_create_publication_and_drop_slot() {
let opts = OnlineOptions::default();
assert!(opts.auto_create_publication);
assert!(opts.drop_slot_on_cutover);
}

#[test]
fn serde_backwards_compat_missing_auto_create_and_drop_slot_defaults_true() {
let opts = OnlineOptions::default();
let mut json: serde_json::Value = serde_json::to_value(&opts).unwrap();
json.as_object_mut()
.unwrap()
.remove("auto_create_publication");
json.as_object_mut().unwrap().remove("drop_slot_on_cutover");
let opts2: OnlineOptions = serde_json::from_value(json).unwrap();
assert!(opts2.auto_create_publication);
assert!(opts2.drop_slot_on_cutover);
}

#[test]
fn validate_accepts_valid_online_config() {
let cfg = MigrationConfig {