Improve replication since_seq parameter (#5881)
base: main
@@ -73,7 +73,7 @@
 % Worker children get a default 5 second shutdown timeout, so pick a value just
 % a bit less than that: 4.5 seconds. In couch_replicator_sup our scheduler
-% worker doesn't specify the timeout, so it up picks ups the OTP default of 5
+% worker doesn't specify the timeout, so it picks ups the OTP default of 5
 % seconds https://www.erlang.org/doc/system/sup_princ.html#child-specification
 %
 -define(TERMINATE_SHUTDOWN_TIME, 4500).

Contributor comment on the added line: s/ups/up ?
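As background for the shutdown values discussed in that comment, below is a minimal child spec sketch; the module, function, and id names are placeholders rather than the actual couch_replicator_sup entries. When the shutdown key is omitted for a worker, OTP falls back to the 5000 ms default the comment refers to, which is why the terminate logic budgets only 4500 ms.

```erlang
%% Sketch only: placeholder names, not the real couch_replicator_sup child spec.
-module(shutdown_spec_sketch).
-export([scheduler_child_spec/0]).

scheduler_child_spec() ->
    #{
        id => scheduler,
        start => {couch_replicator_scheduler, start_link, []},
        restart => permanent,
        % If this key is omitted, a worker child defaults to a 5000 ms shutdown
        % timeout; terminate/2 therefore has to finish within that window.
        shutdown => 5000,
        type => worker
    }.
```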
@@ -173,9 +173,9 @@ job_proxy_url(_Endpoint) ->
     null.

 % Health threshold is the minimum amount of time an unhealthy job should run
-% crashing before it is considered to be healthy again. HealtThreashold should
+% crashing before it is considered to be healthy again. Health threshold should
 % not be 0 as jobs could start and immediately crash, and it shouldn't be
 % infinity, since then consecutive crashes would accumulate forever even if
 % job is back to normal.
 -spec health_threshold() -> non_neg_integer().
 health_threshold() ->
@@ -522,7 +522,7 @@ pending_fold(Job, {Set, Now, Count, HealthThreshold}) ->

 % Replace Job in the accumulator if it has a higher priority (lower priority
 % value) than the lowest priority there. Job priority is indexed by
-% {FairSharePiority, LastStarted} tuples. If the FairSharePriority is the same
+% {FairSharePriority, LastStarted} tuples. If the FairSharePriority is the same
 % then last started timestamp is used to pick. The goal is to keep up to Count
 % oldest jobs during the iteration. For example, if there are jobs with these
 % priorities accumulated so far [5, 7, 11], and the priority of current job is
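The hunk cuts the comment off, but the bounded-accumulator idea it describes can be sketched with gb_sets. This is an illustration, not the module's actual pending_fold/2: the function name is invented and the keys are shown as bare numbers instead of {FairSharePriority, LastStarted} tuples.

```erlang
%% Sketch only: keep at most Count of the smallest keys seen so far, replacing
%% the current largest whenever a smaller key arrives.
-module(priority_acc_sketch).
-export([maybe_keep/3]).

maybe_keep(Key, Count, Set) ->
    case gb_sets:size(Set) < Count of
        true ->
            gb_sets:add(Key, Set);
        false ->
            {Largest, Rest} = gb_sets:take_largest(Set),
            case Key < Largest of
                true -> gb_sets:add(Key, Rest);
                false -> Set
            end
    end.
```

With [5, 7, 11] already accumulated and Count = 3, a new key of 6 would displace 11, while a new key of 12 would leave the set unchanged.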
@@ -594,14 +594,13 @@ not_recently_crashed(#job{history = History}, Now, HealthThreshold) ->
 % and running successfully without crashing for a period of time. That period
 % of time is the HealthThreshold.
 %

 -spec consecutive_crashes(history(), non_neg_integer()) -> non_neg_integer().
 consecutive_crashes(History, HealthThreshold) when is_list(History) ->
     consecutive_crashes(History, HealthThreshold, 0).

 -spec consecutive_crashes(history(), non_neg_integer(), non_neg_integer()) ->
     non_neg_integer().
-consecutive_crashes([], _HealthThreashold, Count) ->
+consecutive_crashes([], _HealthThreshold, Count) ->
     Count;
 consecutive_crashes(
     [{{crashed, _}, CrashT}, {_, PrevT} = PrevEvent | Rest],
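To make the counting idea concrete, here is a deliberately simplified sketch. It is not the module's consecutive_crashes/3: as the clause above shows, the real function also compares the gap between the crash and the previous event against HealthThreshold, which this version ignores.

```erlang
%% Sketch only: count crash events from the newest history entry backwards,
%% stopping at the first non-crash event (or the end of the history).
-module(crash_streak_sketch).
-export([crash_streak/1]).

crash_streak([{{crashed, _Reason}, _Timestamp} | Rest]) ->
    1 + crash_streak(Rest);
crash_streak(_Other) ->
    0.
```

For example, crash_streak([{{crashed, timeout}, 100}, {{crashed, nxdomain}, 90}, {started, 80}]) returns 2.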
@@ -795,7 +794,7 @@ rotate_jobs(State, ChurnSoFar) ->
    if
        SlotsAvailable >= 0 ->
            % If there is are enough SlotsAvailable reduce StopCount to avoid
-           % unnesessarily stopping jobs. `stop_jobs/3` ignores 0 or negative
+           % unnecessarily stopping jobs. `stop_jobs/3` ignores 0 or negative
            % values so we don't worry about that here.
            StopCount = lists:min([Pending - SlotsAvailable, Running, Churn]),
            stop_jobs(StopCount, true, State),
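A small worked example of that three-way clamp, with purely illustrative numbers: 10 pending jobs, 4 free slots, 20 running jobs, and a churn budget of 5. Only 10 - 4 = 6 pending jobs actually need a slot freed up, but the churn budget caps the number of stops at 5.

```erlang
1> lists:min([10 - 4, 20, 5]).
5
```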
@@ -930,7 +929,7 @@ optimize_int_option({Key, Val}, #rep{options = Options} = Rep) ->
 % Updater is a separate process. It receives `update_stats` messages and
 % updates scheduler stats from the scheduler jobs table. Updates are
 % performed no more frequently than once per ?STATS_UPDATE_WAIT milliseconds.
 %
 update_running_jobs_stats(StatsPid) when is_pid(StatsPid) ->
     StatsPid ! update_stats,
     ok.
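The throttling behaviour that comment describes can be sketched as below. The names and the sleep-then-drain approach are assumptions for illustration only; the module's actual updater may track timestamps rather than sleeping.

```erlang
%% Sketch only: perform at most one stats refresh per WaitMSec, collapsing any
%% update_stats requests that arrive during the wait into the refresh already
%% performed.
-module(stats_updater_sketch).
-export([updater_loop/2]).

updater_loop(UpdateFun, WaitMSec) ->
    receive
        update_stats ->
            UpdateFun(),
            timer:sleep(WaitMSec),
            drain_update_requests(),
            updater_loop(UpdateFun, WaitMSec)
    end.

drain_update_requests() ->
    receive
        update_stats -> drain_update_requests()
    after 0 -> ok
    end.
```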
@@ -110,7 +110,7 @@ stop(Pid) when is_pid(Pid) ->
    % In the rare case the job is already stopping as we try to stop it, it
    % won't return ok but exit the calling process, usually the scheduler, so
    % we guard against that. See:
-   % www.erlang.org/doc/apps/stdlib/gen_server.html#stop/3
+   % https://www.erlang.org/doc/apps/stdlib/gen_server.html#stop/3
    catch gen_server:stop(Pid, shutdown, ?STOP_TIMEOUT_MSEC),
    exit(Pid, kill),
    receive
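Since the hunk stops at the receive, here is a generic sketch of the graceful-then-forced stop pattern for completeness. The monitor-based wait, the function name, and the 5000 ms timeouts are assumptions, not the module's actual stop/1.

```erlang
%% Sketch only: try a graceful gen_server stop, then force-kill and wait on a
%% monitor so the caller knows the process is gone before returning.
-module(force_stop_sketch).
-export([force_stop/1]).

force_stop(Pid) when is_pid(Pid) ->
    Ref = erlang:monitor(process, Pid),
    % gen_server:stop/3 exits the caller if the server is already stopping or
    % does not stop within the timeout, so guard it with catch.
    catch gen_server:stop(Pid, shutdown, 5000),
    exit(Pid, kill),
    receive
        {'DOWN', Ref, process, Pid, _Reason} -> ok
    after 5000 ->
        erlang:demonitor(Ref, [flush]),
        {error, timeout}
    end.
```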
@@ -189,7 +189,7 @@ do_init(#rep{options = Options, id = {BaseId, Ext}, user_ctx = UserCtx} = Rep) ->
    % Restarting a temporary supervised child implies that the original arguments
    % (#rep{} record) specified in the MFA component of the supervisor
    % child spec will always be used whenever the child is restarted.
-   % This implies the same replication performance tunning parameters will
+   % This implies the same replication performance tuning parameters will
    % always be used. The solution is to delete the child spec (see
    % cancel_replication/1) and then start the replication again, but this is
    % unfortunately not immune to race conditions.
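The "delete the child spec and start again" approach that comment refers to looks roughly like the sketch below. The supervisor name, child id, module, and restart strategy are placeholders and error handling is omitted, so treat it as an illustration of the supervisor API rather than the replicator's actual cancel-and-restart path.

```erlang
%% Sketch only: a supervisor reuses the original MFA arguments on restart, so
%% changing the #rep{} options means removing the old child spec and adding a
%% fresh one that carries the new record.
-module(respawn_sketch).
-export([restart_with_new_rep/3]).

restart_with_new_rep(Sup, ChildId, NewRep) ->
    _ = supervisor:terminate_child(Sup, ChildId),
    _ = supervisor:delete_child(Sup, ChildId),
    supervisor:start_child(Sup, #{
        id => ChildId,
        start => {replication_job, start_link, [NewRep]},
        restart => temporary,
        type => worker
    }).
```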
@@ -685,7 +685,12 @@ init_state(Rep) ->
    Stats = couch_replicator_stats:max_stats(ArgStats1, HistoryStats),

    StartSeq1 = get_value(since_seq, Options, StartSeq0),
-   StartSeq = {0, StartSeq1},
+   StartSeq2 =
+       case StartSeq0 > StartSeq1 of
+           true -> StartSeq0;
+           false -> StartSeq1
+       end,
+   StartSeq = {0, StartSeq2},

    SourceSeq = get_value(<<"update_seq">>, SourceInfo, ?LOWEST_SEQ),

Contributor comment on the added case expression: We can't compare sequences in general, but we can compare with 0 if we're starting from scratch, so we should use == 0. Based on the discussion in #5867 we'd want something like this:
- If there is a checkpoint (StartSeq0 =/= 0), use the checkpoint and ignore since_seq.
- Otherwise (no checkpoint), if the user specified a since_seq, use the since_seq.
- In addition, make sure the since_seq is folded into the replication ID (append it to the list if it's there, but if it isn't, make sure to generate the same replication ID as before).

This will rewind changes for existing replications with since_seq back to 0, so it's something to warn users about in the release notes.
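As a concrete reading of that suggestion, here is a hedged sketch of the checkpoint-first selection the reviewer describes. It is not the PR's code: the helper name is invented, Options is assumed to be a proplist readable with proplists:get_value/3, and the replication-ID folding the comment asks for is deliberately left out.

```erlang
%% Sketch only: resume from the checkpoint when one exists, and only honor a
%% user-supplied since_seq when the replication is starting from scratch.
-module(since_seq_sketch).
-export([choose_start_seq/2]).

choose_start_seq(StartSeq0, Options) ->
    SinceSeq = proplists:get_value(since_seq, Options, StartSeq0),
    case StartSeq0 == 0 of
        % No checkpoint yet: use since_seq (or stay at 0 if none was given).
        true -> SinceSeq;
        % Checkpoint exists: ignore since_seq and resume from it.
        false -> StartSeq0
    end.
```

Appending since_seq to the replication ID, as the comment suggests, would presumably give replications with different since_seq values distinct checkpoint histories, at the cost of rewinding existing since_seq replications back to 0.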
Review comment: please make the many (and welcome!) documentation fixes in a separate PR from the one making the since_seq enhancement.