diff --git a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb index 497ee7bd..04b54755 100644 --- a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb +++ b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb @@ -145,9 +145,21 @@ def before_perform end def on_start - count = @task.count + @run.start(safe_count) + end + + def safe_count + count = CountTimeout.with_timeout(MaintenanceTasks.count_timeout_ms) { @task.count } count = @collection_enum.size if count == NO_COUNT_DEFINED - @run.start(count) + count + rescue ActiveRecord::QueryCanceled => error + Rails.error.report( + error, + handled: true, + severity: :warning, + context: { task: @task.class.name, timeout_ms: MaintenanceTasks.count_timeout_ms }, + ) + nil end def on_shutdown diff --git a/lib/maintenance_tasks.rb b/lib/maintenance_tasks.rb index 21a58b7f..732be863 100644 --- a/lib/maintenance_tasks.rb +++ b/lib/maintenance_tasks.rb @@ -7,6 +7,7 @@ require "job-iteration" require "maintenance_tasks/engine" +require "maintenance_tasks/count_timeout" # The engine's namespace module. It provides isolation between the host # application's code and the engine-specific code. Top-level engine constants @@ -150,6 +151,23 @@ module MaintenanceTasks NO_COUNT_DEFINED.freeze private_constant :NO_COUNT_DEFINED + # @!attribute count_timeout_ms + # @scope class + # Server-side statement timeout, in milliseconds, applied while computing + # a Task's count for progress tracking. The count query populates the + # progress bar and is not required for task execution, so a slow count + # must never block or fail a run. + # + # Implemented via the database's native cancellation mechanism + # (`statement_timeout` on PostgreSQL, `max_execution_time` on MySQL / + # Trilogy). Adapters without a native equivalent (e.g. SQLite) ignore the + # setting. See {CountTimeout}. + # + # Set to `nil` or `0` to disable. + # + # @return [Integer, nil] the timeout in milliseconds. + mattr_accessor :count_timeout_ms, default: 5_000 + class << self DEPRECATION_MESSAGE = "MaintenanceTasks.error_handler is deprecated and will be removed in the 3.0 release. " \ "Instead, reports will be sent to the Rails error reporter. Do not set a handler and subscribe " \ diff --git a/lib/maintenance_tasks/count_timeout.rb b/lib/maintenance_tasks/count_timeout.rb new file mode 100644 index 00000000..a67baa89 --- /dev/null +++ b/lib/maintenance_tasks/count_timeout.rb @@ -0,0 +1,80 @@ +# frozen_string_literal: true + +module MaintenanceTasks + # Wraps a block in a database-level statement timeout so a slow query is + # cancelled by the server rather than left running indefinitely. + # + # Used by {TaskJobConcern} to bound the {Task#count} query that populates + # the progress bar — a count failure must not stall or fail a task run. + # + # The mechanism is adapter-specific: + # + # * **PostgreSQL** — `SET LOCAL statement_timeout` inside + # `transaction(requires_new: true)`; automatically reset on commit or + # rollback. When invoked inside an existing transaction (e.g. test + # fixtures), AR's `requires_new` opens a savepoint rather than a real + # transaction; the timeout is still enforced for the duration of the + # block, but the `SET LOCAL` setting is only cleared when the outer + # transaction ends. {TaskJobConcern#on_start} is not wrapped in a + # transaction in production, so this is invisible there. + # * **MySQL / Trilogy** — `SET SESSION max_execution_time`, with the prior + # value restored in an `ensure` block. + # * **SQLite / unknown adapters** — no-op; the block runs without a timeout. + # + # On timeout the database raises {ActiveRecord::QueryCanceled} (or its + # subclass {ActiveRecord::StatementTimeout}). Callers are expected to + # rescue and treat the result as unknown. + # + # @example + # MaintenanceTasks::CountTimeout.with_timeout(5_000) do + # User.where(active: true).count + # end + module CountTimeout + extend self + + # Run +block+ with a server-side statement timeout of +timeout_ms+ + # milliseconds applied to +connection+. + # + # When +timeout_ms+ is +nil+, +0+, or negative the block runs unmodified — + # callers can pass the configured value directly without a guard. + # + # @param timeout_ms [Integer, nil] the timeout in milliseconds. + # @param connection [ActiveRecord::ConnectionAdapters::AbstractAdapter] + # the connection to apply the timeout to. Defaults to + # `ActiveRecord::Base.connection`. + # @yield the block to execute under the timeout. + # @return the value returned by the block. + # @raise [ActiveRecord::QueryCanceled] if the database cancels a query. + def with_timeout(timeout_ms, connection: ActiveRecord::Base.connection, &block) + return yield if timeout_ms.nil? || timeout_ms <= 0 + + case connection.adapter_name + when "PostgreSQL", "PostGIS" + with_postgresql_timeout(connection, timeout_ms.to_i, &block) + when "Mysql2", "Trilogy" + with_mysql_timeout(connection, timeout_ms.to_i, &block) + else + yield + end + end + + private + + def with_postgresql_timeout(connection, timeout_ms, &block) + connection.transaction(requires_new: true) do + connection.execute("SET LOCAL statement_timeout = #{timeout_ms}") + block.call + end + end + + def with_mysql_timeout(connection, timeout_ms) + previous = connection.select_value("SELECT @@SESSION.max_execution_time").to_i + connection.execute("SET SESSION max_execution_time = #{timeout_ms}") + begin + yield + ensure + connection.execute("SET SESSION max_execution_time = #{previous}") + end + end + end +end diff --git a/test/jobs/maintenance_tasks/task_job_test.rb b/test/jobs/maintenance_tasks/task_job_test.rb index c9a0ce2c..84dae45e 100644 --- a/test/jobs/maintenance_tasks/task_job_test.rb +++ b/test/jobs/maintenance_tasks/task_job_test.rb @@ -806,6 +806,32 @@ class << self assert_equal 2, @run.reload.tick_total end + test "a count query that exceeds the timeout is rescued and the run starts with no tick_total" do + Maintenance::TestTask.any_instance.stubs(:count).raises( + ActiveRecord::QueryCanceled.new("canceling statement due to statement timeout"), + ) + Rails.error.expects(:report).with do |error, **kwargs| + error.is_a?(ActiveRecord::QueryCanceled) && + kwargs[:handled] == true && + kwargs[:severity] == :warning && + kwargs[:context][:task] == "Maintenance::TestTask" + end + + TaskJob.perform_now(@run) + + assert_nil @run.reload.tick_total + end + + test "@task.count is wrapped in CountTimeout with the configured timeout" do + original = MaintenanceTasks.count_timeout_ms + MaintenanceTasks.count_timeout_ms = 1234 + CountTimeout.expects(:with_timeout).with(1234).yields.returns(MaintenanceTasks.const_get(:NO_COUNT_DEFINED)) + + TaskJob.perform_now(@run) + ensure + MaintenanceTasks.count_timeout_ms = original + end + test ".perform_now accepts custom enumerated tasks" do run = Run.create!(task_name: "Maintenance::CustomEnumeratingTask") diff --git a/test/lib/maintenance_tasks/count_timeout_test.rb b/test/lib/maintenance_tasks/count_timeout_test.rb new file mode 100644 index 00000000..e73a1774 --- /dev/null +++ b/test/lib/maintenance_tasks/count_timeout_test.rb @@ -0,0 +1,178 @@ +# frozen_string_literal: true + +require "test_helper" + +module MaintenanceTasks + class CountTimeoutTest < ActiveSupport::TestCase + test ".with_timeout yields without modifying the connection when timeout_ms is nil" do + connection = mock("connection") + connection.expects(:adapter_name).never + + result = CountTimeout.with_timeout(nil, connection: connection) { :ok } + + assert_equal :ok, result + end + + test ".with_timeout yields without modifying the connection when timeout_ms is zero" do + connection = mock("connection") + connection.expects(:adapter_name).never + + assert_equal :ok, CountTimeout.with_timeout(0, connection: connection) { :ok } + end + + test ".with_timeout yields without modifying the connection when timeout_ms is negative" do + connection = mock("connection") + connection.expects(:adapter_name).never + + assert_equal :ok, CountTimeout.with_timeout(-1, connection: connection) { :ok } + end + + test ".with_timeout is a no-op on SQLite" do + connection = mock("connection") + connection.stubs(adapter_name: "SQLite") + connection.expects(:execute).never + connection.expects(:transaction).never + + assert_equal :ok, CountTimeout.with_timeout(5_000, connection: connection) { :ok } + end + + test ".with_timeout is a no-op for unknown adapters" do + connection = mock("connection") + connection.stubs(adapter_name: "SomeFutureAdapter") + connection.expects(:execute).never + connection.expects(:transaction).never + + assert_equal :ok, CountTimeout.with_timeout(5_000, connection: connection) { :ok } + end + + test ".with_timeout sets statement_timeout inside a transaction on PostgreSQL" do + connection = mock("connection") + connection.stubs(adapter_name: "PostgreSQL") + + transaction_seen = false + connection.expects(:transaction).with(requires_new: true).yields.returns(:txn_result) + connection.expects(:execute).with("SET LOCAL statement_timeout = 5000") + + result = CountTimeout.with_timeout(5_000, connection: connection) do + transaction_seen = true + :inner + end + + assert transaction_seen, "block should have been invoked" + assert_equal :txn_result, result + end + + test ".with_timeout works on PostGIS" do + connection = mock("connection") + connection.stubs(adapter_name: "PostGIS") + connection.expects(:transaction).with(requires_new: true).yields.returns(:done) + connection.expects(:execute).with("SET LOCAL statement_timeout = 1234") + + assert_equal :done, CountTimeout.with_timeout(1_234, connection: connection) { :ignored } + end + + test ".with_timeout sets and restores max_execution_time on Mysql2" do + connection = mock("connection") + connection.stubs(adapter_name: "Mysql2") + + seq = sequence("mysql") + connection.expects(:select_value).with("SELECT @@SESSION.max_execution_time").returns("0").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 5000").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 0").in_sequence(seq) + + assert_equal :ok, CountTimeout.with_timeout(5_000, connection: connection) { :ok } + end + + test ".with_timeout works on Trilogy" do + connection = mock("connection") + connection.stubs(adapter_name: "Trilogy") + + seq = sequence("trilogy") + connection.expects(:select_value).returns("250").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 5000").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 250").in_sequence(seq) + + CountTimeout.with_timeout(5_000, connection: connection) { :ok } + end + + test ".with_timeout restores max_execution_time on MySQL even if the block raises" do + connection = mock("connection") + connection.stubs(adapter_name: "Mysql2") + + seq = sequence("mysql") + connection.expects(:select_value).returns("100").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 5000").in_sequence(seq) + connection.expects(:execute).with("SET SESSION max_execution_time = 100").in_sequence(seq) + + assert_raises(RuntimeError) do + CountTimeout.with_timeout(5_000, connection: connection) { raise "boom" } + end + end + + test ".with_timeout lets ActiveRecord::QueryCanceled propagate" do + connection = mock("connection") + connection.stubs(adapter_name: "Mysql2") + connection.stubs(:select_value).returns("0") + connection.stubs(:execute) + + assert_raises(ActiveRecord::QueryCanceled) do + CountTimeout.with_timeout(5_000, connection: connection) do + raise ActiveRecord::QueryCanceled, "timeout" + end + end + end + + class PostgreSQLIntegrationTest < ActiveSupport::TestCase + # `SET LOCAL` is transaction-scoped. With transactional fixtures the test + # already runs inside an outer transaction, so the helper's + # `transaction(requires_new: true)` opens a savepoint and the setting + # leaks past the helper's block. Disable fixture wrapping so we exercise + # the same code path the production job runs in. + self.use_transactional_tests = false + + setup do + skip "requires PostgreSQL" unless ["PostgreSQL", "PostGIS"].include?( + ActiveRecord::Base.connection.adapter_name, + ) + end + + test "a query exceeding the timeout raises ActiveRecord::QueryCanceled" do + assert_raises(ActiveRecord::QueryCanceled) do + CountTimeout.with_timeout(50) do + ActiveRecord::Base.connection.select_value("SELECT pg_sleep(2)") + end + end + end + + test "a fast query completes normally under the timeout" do + result = CountTimeout.with_timeout(5_000) do + ActiveRecord::Base.connection.select_value("SELECT 1") + end + + assert_equal 1, result + end + + test "statement_timeout is reset after the block returns" do + CountTimeout.with_timeout(50) do + ActiveRecord::Base.connection.select_value("SELECT 1") + end + + timeout = ActiveRecord::Base.connection.select_value("SHOW statement_timeout") + + assert_equal "0", timeout + end + + test "statement_timeout is reset after the block raises" do + assert_raises(ActiveRecord::QueryCanceled) do + CountTimeout.with_timeout(50) do + ActiveRecord::Base.connection.select_value("SELECT pg_sleep(2)") + end + end + + timeout = ActiveRecord::Base.connection.select_value("SHOW statement_timeout") + + assert_equal "0", timeout + end + end + end +end