Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
* [753](https://github.com/dbekaert/RAiDER/pull/753) - Removed support for ERA-Interim.

### Changed
* [791](https://github.com/dbekaert/RAiDER/pull/791) - Updated warning message to reflect the number of dropped stations due to NaN values, and adjusted logging lines for `raiderCombine.py` workflow such that every message is not a warning.
* [788](https://github.com/dbekaert/RAiDER/pull/788) - Streamlined the logic for handling stations where σ_model² < 0, specifically by default such cases are set to nan, but a user may clamp σ_model² to 0 instead by specifying the `--no-allow-nan` flag for `raiderCombine.py`.
* [743](https://github.com/dbekaert/RAiDER/pull/743) - Switched from HTTPS to DAP4 for retrieving MERRA2 data, and suppressed a warning for using DAP4 for GMAO data where doing so is not possible.

Expand Down
163 changes: 133 additions & 30 deletions tools/RAiDER/gnss/processDelayFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,11 +355,22 @@ def variance_analysis(
else:
station_start = pd.NaT
station_end = pd.NaT
logger.warning(
"Flagged station %s with invalid station start and/or end "
"date(s). Refer to unique dates here: %s.",
group.name,
n_unique_days,
)

if n_global_days is not None and n_global_days > 0:
coverage_pct = (n_unique_days / n_global_days) * 100.0
else:
coverage_pct = np.nan
logger.warning(
"Flagged station %s with no valid sampled days %s.",
group.name,
n_global_days,
)

# Mean-squared terms
# σ_res² = D[r] = E((r - E(r))²)
Expand All @@ -375,22 +386,45 @@ def variance_analysis(
else np.nan
)

# report warnings associated with insufficient sampling
if not n_epochs > 1:
logger.warning(
"Flagged station %s with insufficient valid epochs %d, so σ_res² "
"& σ_GNSS² set to NaN",
group.name,
n_epochs,
)
if not len(sig) > 0:
logger.warning(
"Flagged station %s with insufficient valid sigZTD observations "
"%d, so σ_mean² set to NaN",
group.name,
len(sig),
)

# Mean bias calculation
mean_bias = resid.mean()

# Model variance computation
sigma_model_neg = False
if np.isfinite(sigma_res_sq) and np.isfinite(sigma_gnss_sq):
# σ_wm² = σ_res² - σ_gnss²
diff = sigma_res_sq - sigma_gnss_sq
negative_diff = diff < 0

if negative_diff:
sigma_model_neg = True
logger.warning(
f"Flagged station {group.name} with negative sigma values, "
f"with mean bias {mean_bias}, mean σ_wm² {diff}, "
f"with {n_unique_days} unique days sampled which translates "
f"to {coverage_pct}% daily overlap with the "
f"input dataset timespan."
"Flagged station %s with negative sigma values, "
"with mean bias %s, "
"mean σ_wm² %s, with %d unique days sampled "
"which translates to %.2f%% daily overlap with the "
"input dataset timespan.",
group.name,
mean_bias,
diff,
n_unique_days,
coverage_pct,
)
sigma_model_sq = (
np.nan
Expand All @@ -410,15 +444,23 @@ def variance_analysis(
else:
sigma_mean_bias = np.nan

def first_non_null(series: pd.Series):
def first_non_null(series: pd.Series, label: str):
vals = series.dropna()
return vals.iloc[0] if not vals.empty else np.nan
if not vals.empty:
return vals.iloc[0]
else:
logger.warning(
"Flagged station %s with no valid %s data.",
group.name,
label,
)
return np.nan

data_series = {
"ID": group.name,
"Lat": first_non_null(group["Lat"]),
"Lon": first_non_null(group["Lon"]),
"Hgt_m": first_non_null(group["Hgt_m"]),
"Lat": first_non_null(group["Lat"], "Lat"),
"Lon": first_non_null(group["Lon"], "Lon"),
"Hgt_m": first_non_null(group["Hgt_m"], "Hgt_m"),
"Datetime": station_start,
"Enddate_Datetime": station_end,
"sigZTD": group["sigZTD"].median(),
Expand All @@ -433,6 +475,7 @@ def first_non_null(series: pd.Series):
"n_epochs": n_epochs,
"n_unique_days": n_unique_days,
"pct_days_global": coverage_pct,
"sigma_model_neg": sigma_model_neg,
}

# Only parse Localtime if present
Expand Down Expand Up @@ -721,16 +764,20 @@ def main(
filt_len = len(dfc)
errlimit_dropped = prefilt_len - filt_len
logger.warning(
f"Dropped {errlimit_dropped} observations with sigZTD > input "
f"{obs_errlimit} ({filt_len} remaining)."
"Dropped %d observations with sigZTD > input %s (%d remaining).",
errlimit_dropped,
obs_errlimit,
filt_len,
)

# get temporal sampling stats
mean_delta_days, mode_delta_days = sampling_delta_stats(dfc)

logger.warning(
f"Global mean delta (days): {mean_delta_days} "
f"Global mode delta (days): {mode_delta_days}"
logger.info(
"Global mean delta (days): %s "
"Global mode delta (days): %s",
mean_delta_days,
mode_delta_days,
)

# determine coverage window across all retained observations
Expand All @@ -743,12 +790,14 @@ def main(
n_global_days_total_span = (global_end - global_start).days + 1
# capture reference, maximum temporal sampling
# using mean time delta of all observations
n_global_days = int(n_global_days_total_span / mean_delta_days)
logger.warning(
"The earliest/latest dates found are "
f"{global_start} & {global_end} "
f"which spans {n_global_days_total_span} days, with "
f"an average sampling of {n_global_days} days"
n_global_days = math.ceil(n_global_days_total_span / mode_delta_days)
logger.info(
"The earliest/latest dates found are %s & %s which spans %d days, "
"with a sampling mode of %d days",
global_start,
global_end,
n_global_days_total_span,
n_global_days,
)

dfc_qm = (
Expand All @@ -765,10 +814,37 @@ def main(
.reset_index(drop=True)
)

# Drop all lines with NaNs and duplicates
    # compute statistics for flagged `sigma_model_neg`
dfc_qm.drop_duplicates(inplace=True)
n_before = len(dfc_qm)
sigma_model_filt_len = dfc_qm["sigma_model_neg"].sum()
if sigma_model_filt_len > 0:
df_neg = dfc_qm.loc[dfc_qm["sigma_model_neg"]]
stats = df_neg[
["n_unique_days", "pct_days_global"]
].agg(["mean", "median", "min", "max"])

logger.warning(
"Negative σ_model² for %d stations. "
"Unique days sampled: mean=%.2f, median=%.2f, "
"min=%.2f, max=%.2f. "
"Daily overlap: mean=%.2f%%, median=%.2f%%, "
"min=%.2f%%, max=%.2f%%.",
len(df_neg),
stats.loc["mean", "n_unique_days"],
stats.loc["median", "n_unique_days"],
stats.loc["min", "n_unique_days"],
stats.loc["max", "n_unique_days"],
stats.loc["mean", "pct_days_global"],
stats.loc["median", "pct_days_global"],
stats.loc["min", "pct_days_global"],
stats.loc["max", "pct_days_global"],
)

# Drop all lines with NaNs and duplicates
dfc_qm = dfc_qm.drop(columns=["sigma_model_neg"])
dfc_qm.dropna(how="any", inplace=True)
dfc_qm.drop_duplicates(inplace=True)
nan_filt_len = n_before - len(dfc_qm)

if min_pct_days > 0:
before_filter = len(dfc_qm)
Expand All @@ -782,18 +858,45 @@ def main(
len(dfc_qm),
)

if allow_nan_for_negative:
n_flagged = n_before - len(dfc_qm)
if not allow_nan_for_negative:
n_flagged = (dfc_qm["sigma_model"] == 0).sum()
logger.warning(
f"Dropped {n_flagged} stations containing NaN sigma values "
f"({len(dfc_qm)} remaining)."
"%d/%d stations contain 0 sigma values.",
n_flagged,
len(dfc_qm),
)
else:
n_flagged = (dfc_qm["sigma_model"] == 0).sum()

if nan_filt_len > 0:
logger.warning(
f"{n_flagged}/{len(dfc_qm)} stations contain 0 sigma values."
"Dropped %d stations containing NaN due to all imposed filters "
"(see other warnings above for more details), "
"(%d) stations remaining.",
nan_filt_len,
len(dfc_qm),
)

# capture final stats
stats = dfc_qm[
["n_unique_days", "pct_days_global"]
].agg(["mean", "median", "min", "max"])

logger.info(
"%d kept stations. "
"Unique days sampled: mean=%.2f, median=%.2f, "
"min=%.2f, max=%.2f. "
"Daily overlap: mean=%.2f%%, median=%.2f%%, "
"min=%.2f%%, max=%.2f%%.",
len(dfc_qm),
stats.loc["mean", "n_unique_days"],
stats.loc["median", "n_unique_days"],
stats.loc["min", "n_unique_days"],
stats.loc["max", "n_unique_days"],
stats.loc["mean", "pct_days_global"],
stats.loc["median", "pct_days_global"],
stats.loc["min", "pct_days_global"],
stats.loc["max", "pct_days_global"],
)

dfc_qm.to_csv(
out_path,
index=False,
Expand Down