-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
364 lines (308 loc) · 15.6 KB
/
config.yaml
File metadata and controls
364 lines (308 loc) · 15.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# -----------------------------------------------
# docStreamer Configuration File
# -----------------------------------------------
# This file is watched by the application at runtime.
# Changes will be logged, and new settings will be
# picked up by new processes.
#
# Environment variables will OVERRIDE these settings.
# e.g., export DOCDB_ENDPOINT=other.host
# -----------------------------------------------
# -----------------------------------------------
# Logging Configuration
# -----------------------------------------------
logging:
  # Log level (e.g., "debug", "info", "warn", "error")
  level: "info"
  # Path to the main application log file.
  # The directory will be created if it doesn't exist.
  file_path: "logs/docStreamer.log"
  # Path to the CDC (change data capture) operations log
  ops_log_path: "logs/cdc.log"
  # Path to the full load operations log
  full_load_log_path: "logs/full_load.log"
  # Path to the online validator log
  validator_log_path: "logs/validator.log"
# -----------------------------------------------
# Source DocumentDB
# -----------------------------------------------
docdb:
  endpoint: "localhost"
  # Quoted so YAML keeps the port as a string, not an integer
  port: "7777"
  tls: true
  # NOTE(review): user-specific absolute path — confirm it exists on the deploy host
  ca_file: "/home/daniel.almeida/global-bundle.pem"
  # If true, tlsAllowInvalidHostnames=true will be added to the connection string.
  tls_allow_invalid_hostnames: true
  # extra_params: "directConnection=true&replicaSet=rsName"
# -----------------------------------------------
# Target MongoDB
# -----------------------------------------------
mongo:
  # Comma-separated list of mongos hosts
  endpoint: "dan-ps-lab-mongos00.tp.int.percona.com,dan-ps-lab-mongos01.tp.int.percona.com"
  port: "27017"
  tls: false
  ca_file: ""
  # Only relevant when tls is true
  tls_allow_invalid_hostnames: true
  # extra_params: "directConnection=true&replicaSet=rsName"
  # Extra connection-string options appended verbatim
  extra_params: "wtimeoutMS=120000&socketTimeoutMS=180000"
# -----------------------------------------------
# General Migration Settings
# -----------------------------------------------
migration:
  # 80% of available RAM recommended
  go_mem_limit: "80%"
  # Lower values increase GC frequency (reduces RAM usage)
  go_gc: 50
  # Prefix for environment variables (e.g., MIGRATION_DOCDB_USER)
  env_prefix: "MIGRATION"
  pid_file_path: "docStreamer.pid"
  # Port for the HTTP /status endpoint
  status_http_port: "8080"
  # network_compressors: "zlib,snappy"
  network_compressors: ""
  # Databases to skip during discovery
  exclude_dbs:
    - "admin"
    - "local"
    - "config"
  # Collections to skip during discovery (format: "dbname.collname")
  # If you want to skip all collections for a given database, use the exclude_dbs setting instead
  exclude_collections: []
  # max_concurrent_workers: the max number of collections to copy at the same time.
  # Controls how many collections are migrated simultaneously during the full load stage.
  # Example: with 50 collections and this set to 2, docStreamer migrates 2 collections
  # at a time; as soon as one finishes, the next one starts.
  max_concurrent_workers: 2
  # Database and collections used to keep track of the migration state
  metadata_db: "docStreamer"
  checkpoint_collection: "checkpoints"
  checkpoint_doc_id: "cdc_resume_timestamp"
  status_collection: "status"
  status_doc_id: "migration_status"
  validation_stats_collection: "validation_stats"
  validation_failures_collection: "validation_failures"
  validation_audit_collection: "validation_audit"
  # Timeout for collection discovery and stats (in milliseconds)
  # Default: 120000 (2 minutes) to handle large collections on DocumentDB
  discovery_timeout_ms: 120000
  # Set to true to start the migration all over from scratch.
  # This will drop all databases and collections in the destination environment
  # except for admin, local and config.
  destroy: false
  # Set to true to make no changes and only validate that you are able to connect
  # to source and destination.
  dry_run: false
# -----------------------------------------------
# Online Data Validation
# -----------------------------------------------
validation:
  # enabled: If true, every batch written by CDC is immediately queued for verification.
  # This ensures data consistency in real time. You want to enable this; otherwise you
  # will have to validate source and destination on your own.
  enabled: true
  # --------------------------------------------------------------------------
  # full_validation: Controls the scope of CDC data validation.
  #   - true:  Validates all operations (Inserts, Upserts, Updates, and Deletes).
  #   - false: (Default) Validates ONLY Deletes. This is a massive performance optimization
  #            since Inserts and Updates are guaranteed by the source change stream
  #            payload, whereas Deletes lose their full document payload.
  #
  # * Note: This setting is hot-reloadable! You can change this from true to false
  #   while the app is running and the validator will adjust immediately
  #   without requiring a restart.
  # --------------------------------------------------------------------------
  full_validation: false
  # batch_size: How many documents to validate in a single lookup operation.
  # Larger batches reduce network round-trips but increase memory usage.
  # Default: 100
  batch_size: 500
  # retry_interval_ms: If a record fails validation because it is being written to
  # (a "hot key"), the validator waits this many milliseconds before checking it again.
  # Default: 500
  retry_interval_ms: 500
  # max_validation_workers: How many concurrent threads to use for verifying data.
  # Increase this to speed up validation if you have spare CPU/network capacity.
  # Default: 4
  max_validation_workers: 16
  # max_retries: How many times to retry validation for a "hot key" (actively
  # modified record) before giving up and marking it as skipped/mismatched.
  # Default: 3
  max_retries: 3
  # hot_key_check_interval_minutes: The maximum time (in minutes) a failed or hot key
  # will wait in the queue. Even if the system is busy, the sweeper will FORCE a
  # re-check at this interval to ensure nothing gets stuck forever.
  # Default: 5
  hot_key_check_interval_minutes: 5
  # idle_check_interval_seconds: How often (in seconds) the system checks if the CDC
  # is "idle". If the CDC is idle (active lag is 0 and no events for >5s), the sweeper
  # runs IMMEDIATELY to fix pending keys, ignoring the hot_key_check_interval_minutes timer.
  #
  #   - Lower values (e.g., 2): faster repairs during quiet periods (more CPU).
  #   - Higher values (e.g., 10): less CPU usage, slightly slower reaction to idle time.
  #
  # Default: 5
  idle_check_interval_seconds: 5
  # queue_size: The buffer size for the validation channel.
  # If the CDC writer is faster than the validator, this buffer fills up.
  # If full, CDC will throttle validation requests to avoid slowing down replication.
  # When throttled, CDC will also be throttled to keep its cache from filling up.
  # Increase this value for better validation throughput and to avoid CDC slowing down
  # because validation is too slow. If you notice the queue size increasing, it is either
  # set too low, you don't have enough workers, or the destination is overwhelmed and
  # can't validate records fast enough, so they accumulate in the queue.
  # Important: if the CDC batch size is large and the documents are also large, high
  # values can cause high memory usage,
  # e.g. 1000-doc CDC batches * 2000 queued batches = 2,000,000 docs in memory.
  # Default: 2000 (values lower than 100 revert to the default 2000)
  queue_size: 2000
# -----------------------------------------------
# Adaptive Flow Control (Throttling)
# -----------------------------------------------
flow_control:
  # enabled: If true, docStreamer continuously monitors the target MongoDB's health.
  # It polls `db.serverStatus()` on the target (and all shards if discovered) every second.
  # If any node shows signs of overload (high queues or memory usage),
  # docStreamer will temporarily pause fetching new data from the source.
  enabled: true
  # check_interval_ms: How often (in milliseconds) to poll the target database for status.
  # Default: 1000 (1 second)
  check_interval_ms: 1000
  # target_max_queued_ops: The safety limit for the target's global lock queue.
  # Source metric: db.serverStatus().globalLock.currentQueue.total
  # Checks BOTH mongos and all backend shards.
  # If ANY node has more than this many operations queued, docStreamer pauses.
  # Default: 50
  target_max_queued_ops: 50
  # pause_duration_ms: How long (in milliseconds) to sleep when an overload is detected.
  # The application re-checks the status after this duration.
  # Default: 5000
  pause_duration_ms: 5000
  # latency_threshold_ms: The maximum acceptable latency (in milliseconds) for a
  # 'serverStatus' command on the target MongoDB. If the target takes longer than this
  # to respond, docStreamer pauses CDC and full load writes to let the database recover.
  # This is typically a good indicator of a stressed backend server, since this command
  # should not take this long to return a response.
  # Default: 250
  latency_threshold_ms: 250
  # active_client_threshold: The maximum number of concurrent active clients allowed on
  # the target MongoDB before throttling occurs. This helps prevent connection storms
  # and CPU saturation during heavy loads.
  # TUNING GUIDANCE:
  # This value acts as the "speed limit" for the migration. It measures the TOTAL active
  # clients on the target DB, which includes:
  #   1. The 'max_write_workers' from this application.
  #   2. Internal background threads (validation, monitoring).
  #   3. Any other external applications connected to the target.
  # Ensure this limit is high enough to accommodate your 'max_write_workers' plus a
  # buffer for other traffic.
  # Default: 50
  active_client_threshold: 50
  # min_wired_tiger_tickets: The minimum number of available write tickets required on
  # the target.
  # Source metric: db.serverStatus().wiredTiger.concurrentTransactions.write.available
  # If the available write tickets on ANY node drop to or below this value, docStreamer
  # will pause.
  # Default: 0 (disabled) — note this file sets 1, i.e. the check is active
  min_wired_tiger_tickets: 1
# -----------------------------------------------
# Sharding Settings
# -----------------------------------------------
sharding:
  # Example 1: Compound range sharding (default behavior).
  # Shards on 'email' (ascending) and 'rental_id' (ascending).
  - namespace: "lukRange_1.test_1"
    shard_key: "email, rental_id"
  # Example 2: Custom compound range sharding with pre-splitting.
  # testId is a UUID datatype and testOid is an ObjectId datatype.
  - namespace: "lukHash_1.test_2"
    shard_key: "testId:1, testOid:1"
    unique_shard_key: false  # Not required, defaults to false
    pre_split_strategy: "composite_uuid_oid"
    uuid_field: "testId"
    oid_field: "testOid"
  # Example 3: Hashed sharding.
  # Shards on testId ('uuid' datatype) using a hashed index.
  - namespace: "lukHash_1.test_1"
    shard_key: "testId:hashed"
    pre_split_strategy: "hashed_manual"
    pre_split_chunk_count: 10  # Creates 10 evenly spaced chunks in the hash ring
  # Example 4: Compound hashed sharding.
  # Shards on 'region' (range) and 'user_id' (hashed).
  - namespace: "compoundShard.test"
    shard_key: "region:1, user_id:hashed"
  # Example 5: Range sharding.
  # Shards on 'entryTimestamp' using range sharding.
  - namespace: "singleFieldShard.test"
    shard_key: "entryTimestamp"
    pre_split_strategy: "range_manual"
    pre_split_chunk_count: 10
    split_min: "1700000000000"  # Your start timestamp
    split_max: "2200000000000"  # Your end timestamp
    split_type: "int64"  # Adjust depending on the datatype; example for int64
# -----------------------------------------------
# Full Load Settings
# -----------------------------------------------
cloner:
  # num_read_workers: How many threads read data for a single collection.
  # This is the number of parallel readers (read workers) fetching from DocumentDB
  # for one collection.
  num_read_workers: 4
  # num_insert_workers: How many threads write data for a single collection.
  # This is the number of parallel writers (insert workers) pushing to MongoDB for
  # one given collection.
  # Increase writers to achieve a 1:8 ratio with readers; this typically provides
  # effective performance. Keep in mind this is per collection, so tune it wisely
  # (see max_concurrent_workers).
  # e.g. if num_read_workers = 4 then set num_insert_workers to 32 (4 * 8 = 32).
  # CRITICAL: Reduce insert workers to limit concurrent writes.
  # High values here can OOM a shard by flooding it with connections/data.
  # Recommendation: 4-8 per collection
  num_insert_workers: 4
  # postpone_index_creation: By deferring secondary indexes (which are heavy to maintain
  # during high-velocity inserts), you can significantly improve the write throughput of
  # the full load phase. Secondary indexes are then created after the full load finishes.
  # Default: false
  postpone_index_creation: false
  # read_batch_size: Number of documents per read batch.
  read_batch_size: 1000
  # insert_batch_size: Number of documents per insert batch.
  # Reduce batch size to lower memory pressure per request on the cluster.
  insert_batch_size: 1000
  # insert_batch_bytes: Max size (in bytes) of a single insert batch.
  insert_batch_bytes: 50331648  # 48MB
  # segment_size_docs: Size (in docs) of a segment for parallel reads.
  # A collection of 1M docs will be split into 100 segments of 10k docs.
  segment_size_docs: 10000
  # num_retries: How many times to retry failed operations (reads/writes).
  # Default: 5
  num_retries: 5
  # retry_interval_ms: How long to wait (in ms) between retries.
  # Default: 1000
  retry_interval_ms: 1000
  # write_timeout_ms: Max time (in ms) for a single bulk write batch to complete.
  # Default: 30000 — note this file sets 300000 (5 minutes)
  write_timeout_ms: 300000
# -----------------------------------------------
# Change Data Capture (CDC) Settings
# -----------------------------------------------
cdc:
  # batch_size: How many operations to batch together.
  batch_size: 50
  # batch_interval_ms: Max time to wait (in ms) before flushing a batch.
  # This ensures low-volume changes are still applied quickly.
  batch_interval_ms: 500
  # max_await_time_ms: Max time (in ms) for the change stream to wait for new events.
  max_await_time_ms: 1000
  # max_write_workers: Controls concurrency during the CDC phase.
  # After the full load finishes, this setting controls how many threads apply the
  # stream of real-time changes. It determines how "wide" your write pipeline is during
  # the live CDC phase; a higher number allows more parallelism when replaying
  # real-time events.
  #
  # Resource usage: setting this too high can saturate the connections or CPU on your
  # target MongoDB cluster.
  #
  # TUNING GUIDANCE:
  # This value acts as the "gas pedal" for the migration. It must be configured lower
  # than the 'flow_control.active_client_threshold' setting. If max_write_workers is
  # set higher than or equal to that threshold, the application will detect its own
  # heavy load as "congestion" and pause itself unnecessarily.
  # Recommendation: set this to 50-75% of 'active_client_threshold'
  # (e.g., if the threshold is 50, set this to 25-30).
  # Default: 8
  max_write_workers: 8
  # num_retries: How many times to retry a failed batch (due to network/connection
  # issues) before giving up and stopping the migration.
  # Default: 10
  num_retries: 10
  # retry_interval_ms: How long to wait (in milliseconds) between retry attempts.
  # Default: 1000 (1 second)
  retry_interval_ms: 1000
  # write_timeout_ms: The maximum time to wait for a BulkWrite operation to complete.
  # If the network hangs, this ensures the worker doesn't freeze forever.
  # Default: 30000 (30 seconds) — note this file sets 300000 (5 minutes)
  write_timeout_ms: 300000