From a691392ffdc1a3a1c994ca7bc895257b6fca2fdd Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 4 Mar 2026 09:40:14 -0700 Subject: [PATCH] Harden restart checkpoint validation and regression tests --- docs/developer-guide/restart-checkpoint.md | 8 +- src/sipnet/restart.c | 68 ++++++++++++++- .../testRestartMVP.c | 82 +++++++++++++++++++ 3 files changed, 152 insertions(+), 6 deletions(-) diff --git a/docs/developer-guide/restart-checkpoint.md b/docs/developer-guide/restart-checkpoint.md index a89c7016..130aa595 100644 --- a/docs/developer-guide/restart-checkpoint.md +++ b/docs/developer-guide/restart-checkpoint.md @@ -29,11 +29,15 @@ Checkpoint format is ASCII text with one key/value per line: - header: `SIPNET_RESTART 1.0` - metadata: `model_version`, `build_info`, `checkpoint_utc_epoch`, `processed_steps` - schema layout guard metadata: `schema_layout.envi_size`, `schema_layout.trackers_size`, `schema_layout.phenology_trackers_size`, `schema_layout.event_trackers_size` + - `schema_layout.trackers_size` guards the serialized tracker payload shape for schema v1.0 - mode flags: `flags.*` - boundary metadata: `boundary.year`, `boundary.day`, `boundary.time`, `boundary.length` (no forcing fields, no cumulative GDD) - mean tracker metadata: `mean.npp.*` -- full runtime state: `envi.*`, `trackers.*`, `phenology.*`, `event_trackers.*` +- full runtime state: `envi.*`, serialized `trackers.*`, `phenology.*`, `event_trackers.*` - includes `trackers.gdd` for year-to-date cumulative GDD continuity + - excludes step-level diagnostics that are recomputed on the next timestep: + `trackers.methane`, `trackers.nLeaching`, `trackers.nFixation`, + `trackers.nUptake` - mean ring buffers: `mean.npp.values.length` + `mean.npp.values.`, `mean.npp.weights.length` + `mean.npp.weights.` - end marker: `end_restart 1` @@ -77,7 +81,7 @@ When `--gdd` is enabled, checkpoint resume restores cumulative GDD from `tracker Restart schema v1.0 includes compile-time and runtime drift guards so struct layout changes cannot silently pass: -- Compile-time guards: `_Static_assert` checks in `src/sipnet/restart.c` for `Envi`, `Trackers`, `PhenologyTrackers`, and `EventTrackers`. +- Compile-time guards: `_Static_assert` checks in `src/sipnet/restart.c` for `Envi`, serialized tracker payload shape, `PhenologyTrackers`, and `EventTrackers`. - Runtime guards: `schema_layout.*` fields in each checkpoint are validated on load. - Test guardrails: `tests/sipnet/test_restart_infrastructure/testRestartMVP.c` verifies schema layout keys are present and rejects tampered values. diff --git a/src/sipnet/restart.c b/src/sipnet/restart.c index c1930bf5..36365ce9 100644 --- a/src/sipnet/restart.c +++ b/src/sipnet/restart.c @@ -1,5 +1,6 @@ #include "restart.h" +#include #include #include #include @@ -22,12 +23,44 @@ #define RESTART_SCHEMA_LAYOUT_PHENOLOGY_TRACKERS_SIZE 12 #define RESTART_SCHEMA_LAYOUT_EVENT_TRACKERS_SIZE 8 +typedef struct RestartSerializedTrackersV1 { + double gpp; + double rtot; + double ra; + double rh; + double npp; + double nee; + double yearlyGpp; + double yearlyRtot; + double yearlyRa; + double yearlyRh; + double yearlyNpp; + double yearlyNee; + double totGpp; + double totRtot; + double totRa; + double totRh; + double totNpp; + double totNee; + double evapotranspiration; + double soilWetnessFrac; + double rRoot; + double rSoil; + double rAboveground; + double yearlyLitter; + double woodCreation; + double n2o; + double gdd; + int lastYear; +} RestartSerializedTrackersV1; + _Static_assert(sizeof(Envi) == RESTART_SCHEMA_LAYOUT_ENVI_SIZE, "Restart schema 1.0 drift: Envi changed; bump restart schema " "version and update schema_layout.* checks"); -_Static_assert(sizeof(Trackers) == RESTART_SCHEMA_LAYOUT_TRACKERS_SIZE, - "Restart schema 1.0 drift: Trackers changed; bump restart " - "schema version and update schema_layout.* checks"); +_Static_assert(sizeof(RestartSerializedTrackersV1) == + RESTART_SCHEMA_LAYOUT_TRACKERS_SIZE, + "Restart schema 1.0 drift: serialized trackers payload changed; " + "bump restart schema version and update schema_layout.* checks"); _Static_assert( sizeof(PhenologyTrackers) == RESTART_SCHEMA_LAYOUT_PHENOLOGY_TRACKERS_SIZE, "Restart schema 1.0 drift: PhenologyTrackers changed; bump restart " @@ -146,6 +179,31 @@ validateCheckpointBoundaryForWrite(const char *restartOut, } } +static void +validateCheckpointBoundaryForLoad(const char *restartIn, + const RestartClimateSignature *boundary) { + double stepHours = boundary->length * 24.0; + if (stepHours <= RESTART_FLOAT_EPSILON) { + logError("Restart boundary mismatch in %s: checkpoint boundary has " + "non-positive timestep length (year=%d day=%d time=%.8f " + "length=%.8f)\n", + restartIn, boundary->year, boundary->day, boundary->time, + boundary->length); + exit(EXIT_CODE_BAD_PARAMETER_VALUE); + } + + double hoursUntilMidnight = 24.0 - boundary->time; + if (hoursUntilMidnight > (stepHours + RESTART_FLOAT_EPSILON)) { + logError( + "Restart boundary mismatch in %s: checkpoint boundary is more than " + "one timestep before midnight\n", + restartIn); + logError("Checkpoint boundary: year=%d day=%d time=%.8f length=%.8f\n", + boundary->year, boundary->day, boundary->time, boundary->length); + exit(EXIT_CODE_BAD_PARAMETER_VALUE); + } +} + static void sanitizeBuildInfo(char *dest, size_t destLen, const char *src) { if (destLen == 0) { return; @@ -184,8 +242,9 @@ static void parseValueError(const char *restartIn, const char *key, static long long parseLongLongStrict(const char *restartIn, const char *key, const char *value) { char *end = NULL; + errno = 0; long long parsed = strtoll(value, &end, 10); - if (end == value || *end != '\0') { + if (end == value || *end != '\0' || errno == ERANGE) { parseValueError(restartIn, key, value); } return parsed; @@ -1105,6 +1164,7 @@ void restartLoadCheckpoint(const char *restartIn, MeanTracker *meanNPP) { readRestartState(restartIn, &state, meanNPP); + validateCheckpointBoundaryForLoad(restartIn, &(state.boundaryClimate)); checkRestartContextCompatibility(&state); validateRestartModelBuild(&state); validateRestartBoundary(&state); diff --git a/tests/sipnet/test_restart_infrastructure/testRestartMVP.c b/tests/sipnet/test_restart_infrastructure/testRestartMVP.c index 1a879586..f8e248b3 100644 --- a/tests/sipnet/test_restart_infrastructure/testRestartMVP.c +++ b/tests/sipnet/test_restart_infrastructure/testRestartMVP.c @@ -475,6 +475,11 @@ static int testSegmentedEquivalence(void) { status |= !fileContains(CHECKPOINT_FILE, "mean.npp.length "); status |= !fileContains(CHECKPOINT_FILE, "mean.npp.values.length "); status |= !fileContains(CHECKPOINT_FILE, "mean.npp.weights.length "); + // These step-level diagnostics are intentionally omitted from restart schema. + status |= fileContains(CHECKPOINT_FILE, "trackers.methane "); + status |= fileContains(CHECKPOINT_FILE, "trackers.nLeaching "); + status |= fileContains(CHECKPOINT_FILE, "trackers.nFixation "); + status |= fileContains(CHECKPOINT_FILE, "trackers.nUptake "); status |= fileContains(CHECKPOINT_FILE, "balance."); status |= !previousNonEmptyLineBeforeStartsWith( CHECKPOINT_FILE, "end_restart 1", "mean.npp.weights."); @@ -565,6 +570,41 @@ static int testCheckpointMustEndNearMidnight(void) { return status; } +static int testTamperedBoundaryNotNearMidnightFails(void) { + int status = 0; + int stepStatus = 0; + int rc; + + runShell("rm -f run.out events.out run.restart *.log"); + + stepStatus = prepRunFiles("restart_segment1.clim", "events_segment1.in"); + if (stepStatus) { + logTest("Failed to prepare files for tampered-boundary test segment 1\n"); + return stepStatus; + } + status |= (runModel("restart_seg1.in", "tampered_boundary_seg1.log") != 0); + status |= replaceFirstLineStartingWith(CHECKPOINT_FILE, "boundary.time ", + "boundary.time 12.0\n"); + + stepStatus = prepRunFiles("restart_segment2.clim", "events_segment2.in"); + if (stepStatus) { + logTest("Failed to prepare files for tampered-boundary test segment 2\n"); + return status | stepStatus; + } + + rc = runModel("restart_seg2.in", "tampered_boundary_seg2.log"); + status |= (rc != EXIT_CODE_BAD_PARAMETER_VALUE); + status |= !fileContains( + "tampered_boundary_seg2.log", + "checkpoint boundary is more than one timestep before midnight"); + + if (status) { + logTest("testTamperedBoundaryNotNearMidnightFails failed (rc=%d)\n", rc); + } + + return status; +} + static int testRestartMustStartNearMidnight(void) { int status = 0; int stepStatus = 0; @@ -1097,6 +1137,46 @@ static int testTruncatedCheckpointFails(void) { return status; } +static int testProcessedStepsOverflowFails(void) { + int status = 0; + int stepStatus = 0; + int rc; + + runShell("rm -f run.out events.out run.restart *.log"); + + stepStatus = prepRunFiles("restart_segment1.clim", "events_segment1.in"); + if (stepStatus) { + logTest("Failed to prepare files for processed-steps overflow test " + "segment 1\n"); + return stepStatus; + } + status |= + (runModel("restart_seg1.in", "processed_steps_overflow_seg1.log") != 0); + status |= replaceFirstLineStartingWith( + CHECKPOINT_FILE, "processed_steps ", + "processed_steps 999999999999999999999999999999\n"); + + stepStatus = prepRunFiles("restart_segment2.clim", "events_segment2.in"); + if (stepStatus) { + logTest("Failed to prepare files for processed-steps overflow test " + "segment 2\n"); + return status | stepStatus; + } + + rc = runModel("restart_seg2.in", "processed_steps_overflow_seg2.log"); + status |= (rc != EXIT_CODE_BAD_PARAMETER_VALUE); + status |= + !fileContains("processed_steps_overflow_seg2.log", + "invalid value '999999999999999999999999999999' for key " + "'processed_steps'"); + + if (status) { + logTest("testProcessedStepsOverflowFails failed (rc=%d)\n", rc); + } + + return status; +} + static int testMalformedCheckpointFails(void) { int status = 0; int stepStatus = 0; @@ -1140,6 +1220,7 @@ int run(void) { status |= testSegmentedEquivalence(); status |= testStrictClimateMismatchFails(); status |= testCheckpointMustEndNearMidnight(); + status |= testTamperedBoundaryNotNearMidnightFails(); status |= testRestartMustStartNearMidnight(); status |= testRestartEventBoundaryRequiresSegmentedEvents(); status |= testMissingFinalNewlineCheckpointSucceeds(); @@ -1155,6 +1236,7 @@ int run(void) { status |= testLegacyBalanceKeyRejected(); status |= testBuildInfoMismatchWarnsAndSucceeds(); status |= testTruncatedCheckpointFails(); + status |= testProcessedStepsOverflowFails(); status |= testMalformedCheckpointFails(); return status;