Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions src/make_and_fit_model.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,33 @@
"""
make_and_fit_model(data; n_particles=8, smc_data_proportion=0.1, n_mcmc=200, n_hmc=50, kwargs...)
_stabilize_for_fit(y; flat_threshold=1.0e-3)

Guard against a degenerate (near-constant) transformed series before fitting.

Gaussian process fitting needs the data to have some spread: `AutoGP` rescales `y`
by its range, so a genuinely flat series makes the covariance singular
(`PosDefException`, issue #51). When the relative range of `y`
(`(max - min) / (|mean| + 1)`) falls below `flat_threshold`, add small Gaussian
jitter so the series is fittable; otherwise return `y` unchanged. Operates on the
(already transformed) values, so it is independent of which transformation was used.

This is a secondary safety net for data that is flat even after transformation. The
common case of a pathological Box-Cox λ on otherwise-fittable data is handled earlier,
in [`get_transformations`](@ref), by falling back to a log transformation.
"""
function _stabilize_for_fit(y::AbstractVector{<:Real}; flat_threshold = 1.0e-3)
n = length(y)
n > 1 || return y
scale = abs(sum(y) / n) + 1
rel_range = (maximum(y) - minimum(y)) / scale
rel_range >= flat_threshold && return y # enough spread → leave untouched
σ = flat_threshold * scale
@warn "Near-constant series (relative range $rel_range < $flat_threshold); adding \
jitter (σ = $σ) so the GP covariance stays positive-definite (issue #51)."
return y .+ σ .* randn(n)
end

"""
make_and_fit_model(data; n_particles=8, smc_data_proportion=0.1, n_mcmc=200, n_hmc=50, flat_threshold=1.0e-3, kwargs...)

Create and fit a Gaussian Process (GP) model using Sequential Monte Carlo (SMC) sampling.

Expand All @@ -9,17 +37,21 @@ Create and fit a Gaussian Process (GP) model using Sequential Monte Carlo (SMC)
- `smc_data_proportion`: The proportion of the data to use in each SMC step (default: 0.1).
- `n_mcmc`: The number of MCMC samples (default: 200).
- `n_hmc`: The number of HMC samples (default: 50).
- `flat_threshold`: If the (transformed) target's relative range is below this, small
Gaussian jitter is added so the GP covariance stays positive-definite (default: 1.0e-3).
Series with enough spread are left untouched, preserving existing behaviour.
- `kwargs...`: Additional keyword arguments to pass to the `AutoGP.fit_smc!` function.

# Returns
- `model`: The fitted GP model.
"""
function make_and_fit_model(
data::TData; n_particles = 8, smc_data_proportion = 0.1,
n_mcmc = 200, n_hmc = 50, kwargs...
n_mcmc = 200, n_hmc = 50, flat_threshold = 1.0e-3, kwargs...
)
n_train = length(data.y)
model = AutoGP.GPModel(data.ds, data.y; n_particles = n_particles)
y_fit = _stabilize_for_fit(data.y; flat_threshold = flat_threshold)
model = AutoGP.GPModel(data.ds, y_fit; n_particles = n_particles)
# Ensure smc_data_proportion results in at least step=1 for the schedule
effective_proportion = max(smc_data_proportion, 1.0 / n_train)
schedule = AutoGP.Schedule.linear_schedule(n_train, effective_proportion)
Expand Down
20 changes: 19 additions & 1 deletion src/transformations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ A tuple `(forward_transform, inverse_transform)` where:
- **Forward**: `y ↦ BoxCox_λ(y + offset)` where λ is automatically fitted
- **Inverse**: Custom inverse function handling edge cases for numerical stability
- **Note**: Automatically determines optimal λ parameter via maximum likelihood
- **Fallback**: On near-constant data the Box-Cox MLE can return a pathological λ that
collapses the transform to a constant (singular GP covariance, issue #51). When the
transformed spread degenerates relative to a plain log, this falls back to the
`"positive"` (log) transformation and emits a warning.

# Offset Calculation
An offset is automatically calculated using `_get_offet(values)`:
Expand Down Expand Up @@ -146,8 +150,22 @@ function get_transformations(
return (y -> log(y + offset), y -> max(exp(y) - offset, zero(F)))
elseif transform_name == "boxcox"
max_values = maximum(values)
bc = fit(BoxCoxTransformation, values .+ offset) # Fit Box-Cox transformation
shifted = values .+ offset
bc = fit(BoxCoxTransformation, shifted) # Fit Box-Cox transformation
λ = bc.λ
transformed = bc.(shifted)
bc_range = maximum(transformed) - minimum(transformed)
log_range = maximum(log, shifted) - minimum(log, shifted)
# On near-constant data the Box-Cox MLE can pick a pathological λ (e.g.
# λ ≈ -178 for large counts), which underflows the transform to a constant
# and makes the GP covariance singular (issue #51). When the Box-Cox spread
# collapses relative to a plain log, fall back to the well-behaved "positive"
# (log) transformation, which fits this data fine.
if !all(isfinite, transformed) || bc_range <= 1.0e-2 * log_range
@warn "Box-Cox transformation degenerate (λ = $λ, transformed range = \
$bc_range); falling back to log transformation (issue #51)."
return get_transformations("positive", values)
end
@info "Using Box-Cox transformation with λ = $λ and offset = $offset"
return (y -> bc(y + offset), _inv_boxcox(λ, offset, max_values))
else
Expand Down
21 changes: 21 additions & 0 deletions test/test_helper_functions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,27 @@ end
end
end

@testitem "get_transformations BoxCox Fallback on Flat Data" setup = [DataSnippet] begin
# Near-constant data makes the Box-Cox MLE pick a pathological λ that collapses the
# transform; get_transformations falls back to log instead (issue #51).
flat_values = [
75000.0, 75100.0, 74950.0, 75050.0, 75000.0,
74980.0, 75020.0, 75010.0, 74990.0, 75005.0,
]
forward_transform, inverse_transform = get_transformations("boxcox", flat_values)

# Fallback ⟹ forward transform is log (offset = 0 for these positive values).
@test forward_transform(flat_values[1]) ≈ log(flat_values[1]) rtol = 1.0e-9
# Round-trip still holds on the original scale.
for val in flat_values
@test inverse_transform(forward_transform(val)) ≈ val rtol = 1.0e-9
end

# Healthy, well-spread data should NOT fall back (genuine Box-Cox, not log).
healthy_forward, _ = get_transformations("boxcox", values)
@test !isapprox(healthy_forward(values[1]), log(values[1]); rtol = 1.0e-9)
end

@testitem "get_transformations Percentage with data with zeros" setup = [DataSnippet] begin
forward_transform,
inverse_transform = get_transformations("percentage", values_with_zero)
Expand Down
58 changes: 58 additions & 0 deletions test/test_model_fitting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,61 @@ end
model = make_and_fit_model(data; smc_data_proportion = 0.05, minimal_params...)
@test model isa AutoGP.GPModel
end

@testsnippet FlatData begin
using AutoGP, Dates, Random

flat_dates = collect(Date(2024, 1, 1):Day(1):Date(2024, 1, 10))
# Near-constant counts: the Box-Cox MLE picks a pathological λ here (issue #51).
flat_values = [
75000.0, 75100.0, 74950.0, 75050.0, 75000.0,
74980.0, 75020.0, 75010.0, 74990.0, 75005.0,
]
# Exactly constant: degenerate even after any monotonic transform.
const_values = fill(75000.0, length(flat_dates))
flat_forecast_dates = collect(Date(2024, 1, 11):Day(1):Date(2024, 1, 18))
minimal_params = (n_particles = 1, n_mcmc = 5, n_hmc = 3)
end

@testitem "make_and_fit_model+forecast Box-Cox-degenerate flat data (issue #51)" setup = [FlatData] begin
# Regression: this previously threw PosDefException at forecast time.
Random.seed!(51)
transformation, inv_transformation = get_transformations("boxcox", flat_values)
data = create_transformed_data(flat_dates, flat_values; transformation)
model = make_and_fit_model(data; smc_data_proportion = 0.5, minimal_params...)
@test model isa AutoGP.GPModel

fc = forecast(model, flat_forecast_dates, 25; inv_transformation = inv_transformation)
@test size(fc) == (length(flat_forecast_dates), 25)
@test all(isfinite, fc)
@test all(fc .>= 0)
@test 50_000 < sum(fc) / length(fc) < 100_000 # forecasts stay near ~75,000
end

@testitem "make_and_fit_model+forecast exactly constant data (issue #51)" setup = [FlatData] begin
# Genuinely flat data is rescued by the jitter safety net in make_and_fit_model.
Random.seed!(52)
transformation, inv_transformation = get_transformations("boxcox", const_values)
data = create_transformed_data(flat_dates, const_values; transformation)
model = make_and_fit_model(data; smc_data_proportion = 0.5, minimal_params...)
@test model isa AutoGP.GPModel

fc = forecast(model, flat_forecast_dates, 25; inv_transformation = inv_transformation)
@test size(fc) == (length(flat_forecast_dates), 25)
@test all(isfinite, fc)
@test all(fc .>= 0)
end

@testitem "_stabilize_for_fit leaves healthy data unchanged" setup = [FlatData] begin
healthy = [10.0, 15.0, 12.0, 18.0, 22.0, 25.0, 20.0, 16.0, 14.0, 11.0]
@test NowcastAutoGP._stabilize_for_fit(healthy) === healthy
end

@testitem "_stabilize_for_fit jitters near-constant data" setup = [FlatData] begin
Random.seed!(1)
flat = fill(11.2256, 30)
jittered = NowcastAutoGP._stabilize_for_fit(flat)
@test jittered != flat
rel_range = (maximum(jittered) - minimum(jittered)) / (abs(sum(jittered) / length(jittered)) + 1)
@test rel_range >= 1.0e-3
end
Loading