Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/jobs/runtime/events_cleanup.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def initialize(cutoff_age_in_days)
end

# Builds a Database::OldRecordCleanup for Event with this job's
# cutoff_age_in_days and calls #delete on it.
#
# NOTE(review): two consecutive cleanup calls appear below -- one passing the
# cutoff positionally, one passing it as a keyword. The hunk summary for this
# file reads "1 addition & 1 deletion", so these look like the before/after
# lines of a single change rather than two calls meant to run back to back.
# Confirm against the real file before relying on either line.
def perform
Database::OldRecordCleanup.new(Event, cutoff_age_in_days).delete
Database::OldRecordCleanup.new(Event, cutoff_age_in_days:).delete
end

def job_name_in_configuration
Expand Down
17 changes: 17 additions & 0 deletions app/models/runtime/app_usage_event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,23 @@ class AppUsageEvent < Sequel::Model
:buildpack_guid, :buildpack_name,
:package_state, :previous_package_state, :parent_app_guid,
:parent_app_name, :process_type, :task_name, :task_guid

# Lists the begin/end pairings found in app usage events.
#
# Each entry names the states that open a usage interval
# (:beginning_states), the single state that closes it (:ending_state), and
# the column holding the guid that ties the two together (:guid_column).
# The first entry covers app processes, the second covers tasks.
#
# @return [Array<Hash>] a frozen array built on every call; the hashes and
#   the state arrays inside it are not themselves frozen
def self.usage_lifecycles
  repo = Repositories::AppUsageEventRepository

  process_lifecycle = {
    beginning_states: [ProcessModel::STARTED, repo::WAS_RUNNING_EVENT_STATE],
    ending_state: ProcessModel::STOPPED,
    guid_column: :app_guid
  }

  task_lifecycle = {
    beginning_states: [repo::TASK_STARTED_EVENT_STATE, repo::TASK_WAS_RUNNING_EVENT_STATE],
    ending_state: repo::TASK_STOPPED_EVENT_STATE,
    guid_column: :task_guid
  }

  [process_lifecycle, task_lifecycle].freeze
end

AppUsageEvent.dataset_module do
def supports_window_functions?
false
Expand Down
24 changes: 20 additions & 4 deletions app/models/runtime/task_model.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def after_update

# Destroy hook: calls super first, then records a stop event for the task
# unless terminal_state? is true.
#
# NOTE(review): both `create_stop_event` and `create_stop_event_if_needed`
# appear below with the same guard. The summary for this file reads
# "20 additions & 4 deletions", so the first looks like the removed line and
# the second its replacement -- confirm against the real file.
def after_destroy
super
create_stop_event unless terminal_state?
create_stop_event_if_needed unless terminal_state?
end

def run_action_user
Expand Down Expand Up @@ -137,9 +137,25 @@ def create_start_event
# Calls create_stop_event for this task only when (a) no TASK_STOPPED usage
# event is found for it and (b) a usage event proving that it started is found.
# Returns early, writing nothing, otherwise.
def create_stop_event_if_needed
app_usage_repo = Repositories::AppUsageEventRepository.new

# NOTE(review): the next three statements check only for TASK_STARTED (via
# string literals); the statements after them repeat the same two checks with
# the repository constants and additionally accept TASK_WAS_RUNNING. The
# summary for this file lists 4 deletions, so these are most likely the
# removed pre-change lines -- confirm against the real file. If they did run,
# a task whose only start evidence is a TASK_WAS_RUNNING baseline would
# presumably return here and never reach the newer check.
start_event = app_usage_repo.find_by_task_and_state(task: self, state: 'TASK_STARTED')
existing_stop_event = app_usage_repo.find_by_task_and_state(task: self, state: 'TASK_STOPPED')
return if start_event.nil? || existing_stop_event.present?
# A stop event already exists for this task: do not write a second one.
return if app_usage_repo.find_by_task_and_state(task: self, state: Repositories::AppUsageEventRepository::TASK_STOPPED_EVENT_STATE).present?

# Record the stop only when there is recorded evidence that the task
# started: the TASK_STARTED event, or the TASK_WAS_RUNNING baseline seeded
# for tasks that were already running when the keep-running cleanup was
# introduced. Without either, no consumer ever saw the task start, so a
# stop event would be unmatched noise.
#
# NOTE: on MySQL (default REPEATABLE READ) these must be the first reads
# in the surrounding transaction. MySQL freezes what a transaction can
# see at its first read; if an earlier hook ran a query first, a baseline
# committed in the meantime would be invisible here, and the stop would
# be wrongly skipped.
start_evidence_states = [
Repositories::AppUsageEventRepository::TASK_STARTED_EVENT_STATE,
Repositories::AppUsageEventRepository::TASK_WAS_RUNNING_EVENT_STATE
]
# `state:` receives an array here; presumably the repository matches any of
# the listed states -- verify against find_by_task_and_state.
started = app_usage_repo.find_by_task_and_state(task: self, state: start_evidence_states)
return if started.nil?

create_stop_event
end
Expand Down
12 changes: 12 additions & 0 deletions app/models/services/service_usage_event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,17 @@ class ServiceUsageEvent < Sequel::Model
:service_plan_guid, :service_plan_name,
:service_guid, :service_label,
:service_broker_name, :service_broker_guid

# Lists the begin/end pairings found in service usage events.
#
# There is a single lifecycle, keyed by :service_instance_guid: any of
# CREATED, UPDATED or WAS_RUNNING opens it and DELETED closes it.
#
# @return [Array<Hash>] a frozen one-element array built on every call; the
#   hash and the state array inside it are not themselves frozen
def self.usage_lifecycles
  repo = Repositories::ServiceUsageEventRepository
  opening_states = [repo::CREATED_EVENT_STATE, repo::UPDATED_EVENT_STATE, repo::WAS_RUNNING_EVENT_STATE]

  instance_lifecycle = {
    beginning_states: opening_states,
    ending_state: repo::DELETED_EVENT_STATE,
    guid_column: :service_instance_guid
  }

  [instance_lifecycle].freeze
end
end
end
14 changes: 13 additions & 1 deletion app/repositories/app_usage_event_repository.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@
module VCAP::CloudController
module Repositories
class AppUsageEventRepository
# State strings for app and task usage events, named so that callers can
# reference them as constants instead of repeating the literals.
WAS_RUNNING_EVENT_STATE = 'WAS_RUNNING'.freeze
TASK_STARTED_EVENT_STATE = 'TASK_STARTED'.freeze
TASK_STOPPED_EVENT_STATE = 'TASK_STOPPED'.freeze
# Task baselines get their own state (rather than reusing WAS_RUNNING)
# because task events share the app_usage_events table with app events but
# carry an empty app_guid. If task baselines said WAS_RUNNING, the cleanup
# and the backfill's repair would both treat every task baseline as
# belonging to a single app whose guid is '' -- the cleanup would wrongly
# prune them, and the repair would write bogus STOPPED events for that
# phantom app.
TASK_WAS_RUNNING_EVENT_STATE = 'TASK_WAS_RUNNING'.freeze

# Looks up an app usage event by its guid.
#
# @param guid [String] guid of the usage event
# @return whatever AppUsageEvent.find returns for that guid
def find(guid)
  AppUsageEvent.find(guid: guid)
end
Expand Down Expand Up @@ -152,7 +164,7 @@ def purge_and_reseed_started_apps!
end

# Builds a Database::OldRecordCleanup for AppUsageEvent with the given cutoff
# (in days) and calls #delete on it, asking it to keep at least one record.
#
# NOTE(review): two cleanup calls appear below. The first passes the cutoff
# positionally; the second passes it as a keyword and adds
# keep_running_records: true. The summary for this file reads "13 additions &
# 1 deletion", so the first looks like the removed line and the second its
# replacement -- confirm against the real file.
def delete_events_older_than(cutoff_age_in_days)
Database::OldRecordCleanup.new(AppUsageEvent, cutoff_age_in_days, keep_at_least_one_record: true).delete
Database::OldRecordCleanup.new(AppUsageEvent, cutoff_age_in_days: cutoff_age_in_days, keep_at_least_one_record: true, keep_running_records: true).delete
end

private
Expand Down
3 changes: 2 additions & 1 deletion app/repositories/service_usage_event_repository.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class ServiceUsageEventRepository
# State strings for service usage events.
DELETED_EVENT_STATE = 'DELETED'.freeze
CREATED_EVENT_STATE = 'CREATED'.freeze
UPDATED_EVENT_STATE = 'UPDATED'.freeze
# Baseline state; ServiceUsageEvent.usage_lifecycles lists it next to CREATED
# and UPDATED as a state that begins a service instance's usage interval.
WAS_RUNNING_EVENT_STATE = 'WAS_RUNNING'.freeze

def find(guid)
ServiceUsageEvent.find(guid:)
Expand Down Expand Up @@ -92,7 +93,7 @@ def purge_and_reseed_service_instances!
end

# Builds a Database::OldRecordCleanup for ServiceUsageEvent with the given
# cutoff (in days) and calls #delete on it, asking it to keep at least one
# record.
#
# NOTE(review): two cleanup calls appear below. The first passes the cutoff
# positionally; the second passes it as a keyword and adds
# keep_running_records: true. The summary for this file reads "2 additions &
# 1 deletion", so the first looks like the removed line and the second its
# replacement -- confirm against the real file.
def delete_events_older_than(cutoff_age_in_days)
Database::OldRecordCleanup.new(ServiceUsageEvent, cutoff_age_in_days, keep_at_least_one_record: true).delete
Database::OldRecordCleanup.new(ServiceUsageEvent, cutoff_age_in_days: cutoff_age_in_days, keep_at_least_one_record: true, keep_running_records: true).delete
end
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Adds (up) and removes (down) one composite "lifecycle" index per usage-events
# table:
#   app_usage_events(state, app_guid, id)
#   service_usage_events(state, service_instance_guid, id)
# NOTE(review): presumably these serve the per-state, per-guid lookups of the
# keep-running cleanup -- confirm against the queries that use them.
Sequel.migration do
no_transaction # to use the 'concurrently' option

up do
# PostgreSQL: both indexes are created with `concurrently: true` inside the
# concurrent-timeout helper; `if_not_exists` skips an index that is already
# present.
if database_type == :postgres
VCAP::Migration.with_concurrent_timeout(self) do
add_index :app_usage_events, %i[state app_guid id],
name: :app_usage_events_lifecycle_index,
if_not_exists: true,
concurrently: true

add_index :service_usage_events, %i[state service_instance_guid id],
name: :service_usage_events_lifecycle_index,
if_not_exists: true,
concurrently: true
end

# MySQL: no concurrent option is used; an explicit lookup of the table's
# existing index names stands in for `if_not_exists`.
elsif database_type == :mysql
alter_table :app_usage_events do
# rubocop:disable Sequel/ConcurrentIndex
add_index %i[state app_guid id], name: :app_usage_events_lifecycle_index unless @db.indexes(:app_usage_events).include?(:app_usage_events_lifecycle_index)
# rubocop:enable Sequel/ConcurrentIndex
end

alter_table :service_usage_events do
# rubocop:disable Sequel/ConcurrentIndex
unless @db.indexes(:service_usage_events).include?(:service_usage_events_lifecycle_index)
add_index %i[state service_instance_guid id],
name: :service_usage_events_lifecycle_index
end
# rubocop:enable Sequel/ConcurrentIndex
end
end
end

down do
# Mirrors `up`. The two database branches are written here as separate `if`s
# rather than if/elsif; database_type can match only one of them.
if database_type == :postgres
VCAP::Migration.with_concurrent_timeout(self) do
drop_index :app_usage_events, %i[state app_guid id],
name: :app_usage_events_lifecycle_index,
if_exists: true,
concurrently: true

drop_index :service_usage_events, %i[state service_instance_guid id],
name: :service_usage_events_lifecycle_index,
if_exists: true,
concurrently: true
end
end

if database_type == :mysql
alter_table :app_usage_events do
# rubocop:disable Sequel/ConcurrentIndex
drop_index %i[state app_guid id], name: :app_usage_events_lifecycle_index if @db.indexes(:app_usage_events).include?(:app_usage_events_lifecycle_index)
# rubocop:enable Sequel/ConcurrentIndex
end

alter_table :service_usage_events do
# rubocop:disable Sequel/ConcurrentIndex
if @db.indexes(:service_usage_events).include?(:service_usage_events_lifecycle_index)
drop_index %i[state service_instance_guid id],
name: :service_usage_events_lifecycle_index
end
# rubocop:enable Sequel/ConcurrentIndex
end
end
end
end
23 changes: 23 additions & 0 deletions db/migrations/20260601120100_seed_was_running_app_usage_events.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
require 'database/was_running_backfill'

# Seeds baseline usage events for apps through VCAP::WasRunningBackfill,
# unless the backfill reports that it should be skipped.
Sequel.migration do
  no_transaction # backfill manages its own per-batch transactions

  up do
    backfill_logger = Steno.logger('cc.backfill.was_running')
    backfill = VCAP::WasRunningBackfill

    # Skipping is logged rather than silent.
    next backfill.log_skip(backfill_logger, 'app') if backfill.skip?

    backfill.seed_app_usage_events(self, backfill_logger)
  end

  down do
    # Intentionally empty. A consumer that has already read a seeded row
    # cannot un-read it, so deleting the rows would only strand later STOPPED
    # events with no start event to pair with. Keeping the rows is harmless:
    # a re-run of this migration or of the 'db:was_running_backfill' rake
    # task skips resources that already have a baseline.
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
require 'database/was_running_backfill'

# Seeds baseline usage events for service instances through
# VCAP::WasRunningBackfill, unless the backfill reports that it should be
# skipped.
Sequel.migration do
  no_transaction # backfill manages its own per-batch transactions

  up do
    backfill_logger = Steno.logger('cc.backfill.was_running')
    backfill = VCAP::WasRunningBackfill

    # Skipping is logged rather than silent.
    next backfill.log_skip(backfill_logger, 'service') if backfill.skip?

    backfill.seed_service_usage_events(self, backfill_logger)
  end

  down do
    # Intentionally empty. A consumer that has already read a seeded row
    # cannot un-read it, so deleting the rows would only strand later DELETED
    # events with no start event to pair with. Keeping the rows is harmless:
    # a re-run of this migration or of the 'db:was_running_backfill' rake
    # task skips instances that already have a baseline.
  end
end
26 changes: 26 additions & 0 deletions db/migrations/20260601120300_seed_was_running_task_usage_events.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
require 'database/was_running_backfill'

# Seeds baseline usage events for tasks through VCAP::WasRunningBackfill,
# unless the backfill reports that it should be skipped.
Sequel.migration do
  no_transaction # backfill manages its own per-batch transactions

  up do
    backfill_logger = Steno.logger('cc.backfill.was_running')
    backfill = VCAP::WasRunningBackfill

    # Skipping is logged rather than silent.
    next backfill.log_skip(backfill_logger, 'task') if backfill.skip?

    backfill.seed_task_usage_events(self, backfill_logger)
  end

  down do
    # Intentionally empty. A consumer that has already read a seeded row
    # cannot un-read it, so deleting the rows would only strand later
    # TASK_STOPPED events with no start event to pair with.
    #
    # It would also do real damage: a task's stop event is written only when
    # the task has recorded start evidence, and for tasks whose TASK_STARTED
    # the cleanup already deleted, these rows ARE that evidence. Without them
    # those tasks' eventual stops are silently swallowed.
    #
    # Keeping the rows is harmless: a re-run of this migration or of the
    # 'db:was_running_backfill' rake task skips tasks that already have a
    # baseline.
  end
end
2 changes: 2 additions & 0 deletions docs/v2/app_usage_events/list_all_app_usage_events.html
Original file line number Diff line number Diff line change
Expand Up @@ -631,9 +631,11 @@ <h4>Body</h4>
<ul class="valid_values">
<li>STARTED</li>
<li>STOPPED</li>
<li>WAS_RUNNING</li>
<li>BUILDPACK_SET</li>
<li>TASK_STARTED</li>
<li>TASK_STOPPED</li>
<li>TASK_WAS_RUNNING</li>
</ul>
</td>
<td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ <h4>Body</h4>
<li>CREATED</li>
<li>DELETED</li>
<li>UPDATED</li>
<li>WAS_RUNNING</li>
</ul>
</td>
<td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Content-Type: application/json

Destroys all existing events. Populates new usage events, one for each started app. All populated events will have a `created_at` value of current time. There is a potential race condition if apps are currently being started, stopped, or scaled. The seeded usage events will have the same guid as the app.

**Note:** the reseed only writes `STARTED` events for app processes — it does not restore the start evidence (`TASK_STARTED`/`TASK_WAS_RUNNING`) of currently-running tasks, and `TASK_STOPPED` events are only emitted for tasks with recorded start evidence. After a purge, operators should run `rake db:was_running_backfill` on a Cloud Controller VM to reseed baselines for running tasks; otherwise their eventual stops are silently suppressed.

#### Definition
`POST /v3/app_usage_events/actions/destructively_purge_all_and_reseed`

Expand Down
27 changes: 27 additions & 0 deletions docs/v3/source/includes/resources/app_usage_events/_object.md.erb
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,30 @@ Name | Type | Description
**instance_count.current** | _integer_ or `null` | Current instance count of the app that this event pertains to, if applicable
**instance_count.previous** | _integer_ or `null` | Previous instance count of the app that this event pertains to, if applicable
**links** | [_links object_](#links) | Links to related resources

#### WAS_RUNNING and TASK_WAS_RUNNING events

`WAS_RUNNING` and `TASK_WAS_RUNNING` are synthetic values for `state.current` recorded once per running process (`WAS_RUNNING`) and once per running task (`TASK_WAS_RUNNING`) by a one-time data migration when the keep-running cleanup feature was introduced. They mark every process and task that was already running at the time of the upgrade so that billing consumers can bootstrap from a complete baseline even if the original `STARTED`/`TASK_STARTED` events have been pruned.

**Consumer interpretation** (read `WAS_RUNNING`/`STARTED` as `TASK_WAS_RUNNING`/`TASK_STARTED` for task events, which are keyed by `task.guid`):

* If you have not previously recorded a `STARTED` event for this resource, treat `WAS_RUNNING` as equivalent to `STARTED`.
* If you have already recorded `STARTED` (or an earlier `WAS_RUNNING`) for this resource, treat it as a redundant baseline confirmation and ignore it.
* `created_at` reflects when the backfill migration ran, **not** when the app or task actually started. Treat `WAS_RUNNING` as a baseline marker that the resource was already running as of that timestamp, not as the true start of the running interval.
* `state.previous` on a `WAS_RUNNING` event is always `null`. Subsequent real events for the same resource will continue to report their actual prior process state in `state.previous` (typically `STARTED`). If you perform chain validation, treat `WAS_RUNNING` as equivalent to `STARTED` for the purpose of validating the next event's `state.previous`.

#### Repaired ending events

The backfill (and any later run of its recovery task) repairs baselines that turn out to be unpaired: if a `WAS_RUNNING`/`TASK_WAS_RUNNING` event was recorded for a resource that is no longer running and no later ending event exists for it — for example because the resource stopped while the backfill was still in progress — the missing ending event (`STOPPED`/`TASK_STOPPED`) is appended. Baselines are never deleted. A repaired ending event:

* carries a `created_at` of when the repair ran, **not** when the resource actually stopped — the interval it closes may overstate the true run by that gap;
* copies the footprint (`instance_count`, `memory_in_mb_per_instance`) of the baseline it pairs;
* reports the baseline's state (`WAS_RUNNING`/`TASK_WAS_RUNNING`) in `state.previous`, which normal ending events never carry — use this to tell repaired endings apart.

#### What a consumer must do

Independent of the backfill, the events stream asks three things of any consumer that pairs beginnings with endings:

1. Ignore a `WAS_RUNNING`/`TASK_WAS_RUNNING` event for a resource you already track (see above).
2. Tolerate duplicate ending events for the same resource: close the interval on the first ending after a beginning and ignore further endings until the next beginning.
3. Treat an ending event with no visible beginning for that resource as noise.
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,25 @@ Name | Type | Description
**service_broker.guid** | _string_ or `null` | Unique identifier of the service broker that this event pertains to, if applicable
**service_broker.name** | _string_ or `null` | Name of the service broker that this event pertains to, if applicable
**links** | [_links object_](#links) | Links to related resources

#### WAS_RUNNING events

`WAS_RUNNING` is a synthetic value for `state` recorded once per existing service instance by a one-time data migration when the keep-running cleanup feature was introduced. It marks every service instance that existed at the time of the upgrade so that billing consumers can bootstrap from a complete baseline of service instances even if the original `CREATED` events have been pruned.

**Consumer interpretation:**

* If you have not previously recorded a `CREATED` event for this service instance, treat `WAS_RUNNING` as equivalent to `CREATED`.
* If you have already recorded `CREATED` (or an earlier `WAS_RUNNING`) for this instance, treat it as a redundant baseline confirmation and ignore it.
* `created_at` reflects when the backfill migration ran, **not** when the service instance was created. Treat `WAS_RUNNING` as a baseline marker that the instance already existed as of that timestamp.

#### Repaired ending events

The backfill (and any later run of its recovery task) repairs baselines that turn out to be unpaired: if a `WAS_RUNNING` event was recorded for a service instance that no longer exists and no later `DELETED` event exists for it — for example because the instance was deleted while the backfill was still in progress — the missing `DELETED` event is appended, copying the baseline's instance, plan, and broker attributes. Baselines are never deleted. A repaired `DELETED` event carries a `created_at` of when the repair ran, **not** when the instance was actually deleted — the interval it closes may overstate the instance's true lifetime by that gap.

#### What a consumer must do

Independent of the backfill, the events stream asks three things of any consumer that pairs beginnings with endings:

1. Ignore a `WAS_RUNNING` event for a service instance you already track (see above).
2. Tolerate duplicate `DELETED` events for the same instance: close the interval on the first one and ignore the rest.
3. Treat a `DELETED` event with no visible `CREATED`/`UPDATED`/`WAS_RUNNING` for that instance as noise.
1 change: 1 addition & 0 deletions lib/cloud_controller/config_schemas/api_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ class ApiSchema < VCAP::Config
optional(:migration_psql_concurrent_statement_timeout_in_seconds) => Integer,
optional(:migration_psql_worker_memory_kb) => Integer,
optional(:skip_bigint_id_migration) => bool,
optional(:skip_was_running_backfill) => bool,
db: {
optional(:database) => Hash, # db connection hash for sequel
max_connections: Integer, # max connections in the connection pool
Expand Down
1 change: 1 addition & 0 deletions lib/cloud_controller/config_schemas/migrate_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class MigrateSchema < VCAP::Config
optional(:migration_psql_concurrent_statement_timeout_in_seconds) => Integer,
optional(:migration_psql_worker_memory_kb) => Integer,
optional(:skip_bigint_id_migration) => bool,
optional(:skip_was_running_backfill) => bool,

db: {
optional(:database) => Hash, # db connection hash for sequel
Expand Down
Loading
Loading