From 677cca92f0d22a5efb96552a3bca34b769cc9e5f Mon Sep 17 00:00:00 2001 From: Don Boyd Date: Tue, 24 Mar 2026 13:04:11 -0400 Subject: [PATCH] Enhance quality report: auto-save, multiplier histograms, bystander analysis Major expansion of quality_report.py: - --output flag to auto-save report to file - Scope/timestamp header with solve time summary - Aggregate multiplier distribution across all areas - Weight distribution by AGI stub (national vs sum-of-areas) - Per-bin bystander analysis (targeted vs untargeted distortion) - Top-N per-area detail for large area sets (CDs, counties) - Auto-discover areas from log files - Support --scope cds Co-Authored-By: Claude Opus 4.6 (1M context) --- tmd/areas/quality_report.py | 825 ++++++++++++++++++++++++++++++++---- 1 file changed, 748 insertions(+), 77 deletions(-) diff --git a/tmd/areas/quality_report.py b/tmd/areas/quality_report.py index b18837df..ef4d6b69 100644 --- a/tmd/areas/quality_report.py +++ b/tmd/areas/quality_report.py @@ -1,21 +1,27 @@ -# pylint: disable=import-outside-toplevel,inconsistent-quotes +# pylint: disable=import-outside-toplevel,inconsistent-quotes,too-many-lines """ -Cross-state quality summary report. +Cross-area quality summary report. -Parses solver logs for all states and produces a summary showing: +Parses solver logs for all areas and produces a summary showing: - Solve status and timing - Target accuracy (hit rate, mean/max error) - Weight distortion (RMSE, percentiles) + - Aggregate multiplier distribution (old vs new weight comparison) - Violated targets by variable - - Weight exhaustion and cross-state aggregation diagnostics + - Weight exhaustion and cross-area aggregation diagnostics + - Bystander checks (untargeted variables + per-bin analysis) Usage: python -m tmd.areas.quality_report - python -m tmd.areas.quality_report --scope CA,WY + python -m tmd.areas.quality_report --scope cds + python -m tmd.areas.quality_report --scope cds --output + python -m tmd.areas.quality_report --scope CA,WY -o report.txt + python -m tmd.areas.quality_report --scope MN01,MN02 """ import argparse import re +from datetime import datetime from pathlib import Path import numpy as np @@ -23,9 +29,11 @@ from tmd.areas.create_area_weights import ( AREA_CONSTRAINT_TOL, + CD_TARGET_DIR, + CD_WEIGHT_DIR, + STATE_TARGET_DIR, STATE_WEIGHT_DIR, ) -from tmd.areas.prepare.constants import ALL_STATES from tmd.imputation_assumptions import TAXYEAR _WT_COL = f"WT{TAXYEAR}" @@ -77,6 +85,15 @@ def _humanize_desc(desc: str) -> str: return " ".join(pieces) +def _fmt_agi_bin(lo, hi): + """Format an AGI bin range for display.""" + if lo < -1e10: + return "<$0" + if hi > 1e10: + return f"${lo / 1000:.0f}K+" + return f"${lo / 1000:.0f}K-${hi / 1000:.0f}K" + + def parse_log(logpath: Path) -> dict: """Parse a single area solver log file into a summary dict.""" if not logpath.exists(): @@ -92,6 +109,11 @@ def parse_log(logpath: Path) -> dict: if "PrimalInfeasible" in log or "FAILED" in log: result["status"] = "FAILED" + # Population share + m = re.search(r"pop_share = ([\d.]+) / ([\d.]+) = ([\d.]+)", log) + if m: + result["pop_share"] = float(m.group(3)) + # Solve time m = re.search(r"Solve time: ([\d.]+)s", log) if m: @@ -174,18 +196,174 @@ def parse_log(logpath: Path) -> dict: return result -def generate_report(areas=None, weight_dir=None): - """Generate cross-state quality summary report.""" - if areas is None: - areas = ALL_STATES +def _list_areas_from_logs(weight_dir): + """Return sorted list of area codes from log files.""" + logs = sorted(weight_dir.glob("*.log")) + return [lp.stem for lp in logs] + + +def _infer_target_dir(weight_dir): + """Infer target directory from weight directory.""" + # weight_dir is like .../weights/states or .../weights/cds + return weight_dir.parent.parent / "targets" / weight_dir.name + + +def _aggregate_multiplier_histogram(solved_df): + """ + Aggregate per-area multiplier distributions into a combined view. + + The multiplier x = area_weight / (pop_share * national_weight). + x = 1.0 means the record keeps its population-proportional share. + This is the side-by-side comparison of initial (x=1) vs optimized + weights. + """ + lines = [] + + # Canonical bin order matching the solver's histogram + # Note: exact zeros are logged as [0.0000, 0.0000) → key (0.0, 0.0) + canonical_bins = [ + (0.0, 0.0), + (0.0, 0.1), + (0.1, 0.5), + (0.5, 0.8), + (0.8, 0.9), + (0.9, 0.95), + (0.95, 1.0), + (1.0, 1.05), + (1.05, 1.1), + (1.1, 1.2), + (1.2, 1.5), + (1.5, 2.0), + (2.0, 5.0), + (5.0, 10.0), + (10.0, 100.0), + (100.0, np.inf), + ] + # Aggregate counts from all areas + agg = {} + total_records = 0 + n_areas = 0 + for _, row in solved_df.iterrows(): + dist = row.get("dist_bins", {}) + n_rec = row.get("n_records", 0) + if not dist or n_rec == 0: + continue + n_areas += 1 + total_records += n_rec + for key, val in dist.items(): + agg[key] = agg.get(key, 0) + val["count"] + + if total_records == 0: + return lines + + n_per_area = total_records // n_areas if n_areas > 0 else 0 + lines.append( + "AGGREGATE MULTIPLIER DISTRIBUTION" + f" ({n_areas} areas x {n_per_area:,}" + f" records = {total_records:,} area-record pairs):" + ) + lines.append( + " The multiplier x = area_weight / (pop_share * national_weight)." + ) + lines.append( + " x = 1.0 means the record keeps its population-proportional" + " share." + ) + lines.append( + " Initial (pre-optimization): all x = 1.0." + " This histogram shows the optimized distribution." + ) + + bin_labels = { + (0.0, 0.0): "x = 0 (excluded)", + (0.0, 0.1): "(0, 0.1)", + (0.1, 0.5): "[0.1, 0.5)", + (0.5, 0.8): "[0.5, 0.8)", + (0.8, 0.9): "[0.8, 0.9)", + (0.9, 0.95): "[0.9, 0.95)", + (0.95, 1.0): "[0.95, 1.0)", + (1.0, 1.05): "[1.0, 1.05)", + (1.05, 1.1): "[1.05, 1.1)", + (1.1, 1.2): "[1.1, 1.2)", + (1.2, 1.5): "[1.2, 1.5)", + (1.5, 2.0): "[1.5, 2.0)", + (2.0, 5.0): "[2.0, 5.0)", + (5.0, 10.0): "[5.0, 10.0)", + (10.0, 100.0): "[10.0, 100.0)", + (100.0, np.inf): "[100.0, inf)", + } + + lines.append(f" {'Bin':<20} {'Count':>12} {'Pct':>8} {'CumPct':>8}") + lines.append(" " + "-" * 50) + cum_pct = 0.0 + for bkey in canonical_bins: + cnt = agg.get(bkey, 0) + if cnt == 0: + continue + pct = 100.0 * cnt / total_records + cum_pct += pct + label = bin_labels.get(bkey, str(bkey)) + lines.append(f" {label:<20} {cnt:>12,} {pct:>7.1f}% {cum_pct:>7.1f}%") + + # Summary stats + n_near_one = sum(agg.get(b, 0) for b in [(0.95, 1.0), (1.0, 1.05)]) + n_within_20 = sum( + agg.get(b, 0) + for b in [ + (0.8, 0.9), + (0.9, 0.95), + (0.95, 1.0), + (1.0, 1.05), + (1.05, 1.1), + (1.1, 1.2), + ] + ) + n_excluded = agg.get((0.0, 0.0), 0) + n_extreme = sum( + agg.get(b, 0) for b in [(5.0, 10.0), (10.0, 100.0), (100.0, np.inf)] + ) + lines.append("") + lines.append( + f" Within +/-5% of 1.0:" + f" {n_near_one:,} ({100 * n_near_one / total_records:.1f}%)" + ) + lines.append( + f" Within +/-20% of 1.0:" + f" {n_within_20:,} ({100 * n_within_20 / total_records:.1f}%)" + ) + lines.append( + f" Excluded (x=0):" + f" {n_excluded:,} ({100 * n_excluded / total_records:.1f}%)" + ) + if n_extreme > 0: + lines.append( + f" Extreme (x>=5.0):" + f" {n_extreme:,} ({100 * n_extreme / total_records:.1f}%)" + ) + lines.append("") + + return lines + + +def generate_report( + areas=None, + weight_dir=None, + target_dir=None, + scope_label=None, +): + """Generate cross-area quality summary report.""" if weight_dir is None: weight_dir = STATE_WEIGHT_DIR + if target_dir is None: + target_dir = _infer_target_dir(weight_dir) + if areas is None: + areas = _list_areas_from_logs(weight_dir) rows = [] for st in areas: logpath = weight_dir / f"{st.lower()}.log" info = parse_log(logpath) - info["state"] = st + info["area"] = st rows.append(info) df = pd.DataFrame(rows) @@ -193,7 +371,6 @@ def generate_report(areas=None, weight_dir=None): # Summary statistics solved = df[df["status"].isin(["Solved", "AlmostSolved"])] failed = df[df["status"] == "FAILED"] - n_states = len(df) n_solved = len(solved) n_failed = len(failed) n_violated_states = (solved["n_violated"] > 0).sum() @@ -203,40 +380,57 @@ def generate_report(areas=None, weight_dir=None): lines = [] lines.append("=" * 80) - lines.append("CROSS-STATE QUALITY SUMMARY REPORT") + header = "CROSS-AREA QUALITY SUMMARY REPORT" + if scope_label: + header += f" [{scope_label}]" + lines.append(header) + lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") lines.append("=" * 80) lines.append("") # Overall - lines.append(f"States: {n_states}") + n_areas = len(df) + lines.append(f"Areas: {n_areas}") lines.append(f"Solved: {n_solved}") lines.append(f"Failed: {n_failed}") if n_failed > 0: - lines.append(f" Failed: {', '.join(failed['state'].tolist())}") + lines.append(f" Failed: {', '.join(failed['area'].tolist())}") lines.append( - f"States with violated targets: {n_violated_states}/{n_solved}" + f"Areas with violated targets: {n_violated_states}/{n_solved}" ) if not solved.empty and "targets_total" in solved.columns: tpt = int(solved["targets_total"].iloc[0]) tpt_sum = int(solved["targets_total"].sum()) else: tpt, tpt_sum = "?", "?" - lines.append(f"Total targets: {n_solved} states \u00d7 {tpt} = {tpt_sum}") + lines.append(f"Total targets: {n_solved} areas \u00d7 {tpt} = {tpt_sum}") lines.append(f"Total violated targets: {int(total_violated)}") + if not solved.empty and "solve_time" in solved.columns: + cum_time = solved["solve_time"].sum() + avg_time = solved["solve_time"].mean() + lines.append( + f"Cumulative solve time: {cum_time:.0f}s" + f" (avg {avg_time:.1f}s per area;" + f" ~{cum_time / 16:.0f}s wall @ 16 workers)" + ) lines.append("") # Target accuracy if not solved.empty and "mean_err" in solved.columns: lines.append("TARGET ACCURACY:") lines.append( - f" Per-state mean error: " - f"avg across states={solved['mean_err'].mean():.4f}, " - f"worst state={solved['mean_err'].max():.4f}" + " Error = |achieved - target| / |target|." + " A target is 'hit' if error < tolerance." ) lines.append( - f" Per-state max error: " - f"avg across states={solved['max_err'].mean():.4f}, " - f"worst state={solved['max_err'].max():.4f}" + f" Per-area mean error:" + f"avg={solved['mean_err'].mean():.4f}, " + f"worst={solved['mean_err'].max():.4f}" + ) + lines.append( + f" Per-area max error: " + f"avg={solved['max_err'].mean():.4f}, " + f"worst={solved['max_err'].max():.4f}" ) if "targets_hit" in solved.columns: total_t = solved["targets_total"].iloc[0] @@ -253,6 +447,14 @@ def generate_report(areas=None, weight_dir=None): # Weight distortion if not solved.empty and "w_rmse" in solved.columns: lines.append("WEIGHT DISTORTION (multiplier from 1.0):") + lines.append( + " Each record's multiplier x = area_weight" + " / (pop_share * national_weight)." + ) + lines.append( + " x=1.0 means population-proportional." + " RMSE measures overall departure from x=1." + ) lines.append( f" RMSE: avg={solved['w_rmse'].mean():.3f}, " f"max={solved['w_rmse'].max():.3f}" @@ -280,6 +482,10 @@ def generate_report(areas=None, weight_dir=None): ) lines.append("") + # Aggregate multiplier distribution (old vs new weight comparison) + if not solved.empty: + lines.extend(_aggregate_multiplier_histogram(solved)) + # Near-zero weight summary if not solved.empty: zero_pcts = [] @@ -295,6 +501,10 @@ def generate_report(areas=None, weight_dir=None): lt01_pcts.append(100 * n_lt01 / n_rec) if zero_pcts: lines.append("NEAR-ZERO WEIGHT MULTIPLIERS (% of records):") + lines.append( + " Records with x near 0 are effectively" + " excluded from the area." + ) lines.append( f" Exact zero (x=0): " f"avg={np.mean(zero_pcts):.1f}%, " @@ -307,21 +517,68 @@ def generate_report(areas=None, weight_dir=None): ) lines.append("") - # Per-state table - lines.append("PER-STATE DETAIL:") + # Load TMD data and weight files (once, shared by all diagnostics) + report_data = _load_report_data(areas, weight_dir) + if report_data is not None: + tmd, s006, state_weights, n_loaded = report_data + # Weight distribution by AGI stub (old vs new) + lines.extend( + _weight_distribution_by_stub( + tmd, s006, state_weights, n_loaded, target_dir + ) + ) + + # Per-area table + # For small area counts (states), show all areas. + # For large area counts (CDs, counties), show top 20 by max error. + _DETAIL_CUTOFF = 60 + show_all = n_areas <= _DETAIL_CUTOFF + + if show_all: + lines.append("PER-AREA DETAIL:") + display_df = df + else: + lines.append( + "PER-AREA DETAIL (top 20 by violations / weight distortion):" + ) + # Always include failed areas, then sort solved by + # violations desc, then weight RMSE desc + failed_rows = df[df["status"] == "FAILED"] + solved_rows = df[df["status"].isin(["Solved", "AlmostSolved"])].copy() + solved_rows["_viol"] = solved_rows["n_violated"].fillna(0) + solved_rows["_rmse"] = solved_rows["w_rmse"].fillna(0) + top_solved = solved_rows.sort_values( + ["_viol", "_rmse"], ascending=[False, False] + ).head(20) + top_solved = top_solved.drop(columns=["_viol", "_rmse"]) + display_df = pd.concat([failed_rows, top_solved]).drop_duplicates( + subset=["area"] + ) + lines.append( - " Err cols = |relative error| (fraction); " - "weight cols = multiplier on national weight (1.0 = unchanged)" + " MeanErr/MaxErr = |relative error| (fraction)." + " wRMSE = root-mean-square deviation of" + " multipliers from 1.0" ) + lines.append( + " (higher = more distortion)." + " wP05/wMed/wP95/wMax = multiplier percentiles." + " %zero = records excluded." + ) + max_id = max( + (len(str(row["area"])) for _, row in display_df.iterrows()), + default=4, + ) + id_w = max(max_id + 1, 5) header = ( - f"{'St':<4} {'Status':<14} {'Hit':>5} {'Tot':>5} " + f"{'Area':<{id_w}} {'Status':<14} {'Hit':>5} {'Tot':>5} " f"{'Viol':>5} {'MeanErr':>8} {'MaxErr':>8} " f"{'wRMSE':>7} {'wP05':>7} {'wMed':>7} " f"{'wP95':>7} {'wMax':>8} {'%zero':>6}" ) lines.append(header) lines.append("-" * len(header)) - for _, row in df.iterrows(): + for _, row in display_df.iterrows(): hit = int(row.get("targets_hit", 0)) tot = int(row.get("targets_total", 0)) viol = int(row.get("n_violated", 0)) @@ -338,12 +595,16 @@ def generate_report(areas=None, weight_dir=None): if isinstance(dist, dict): n_zero = dist.get((0.0, 0.0), {}).get("count", 0) pct_zero = 100 * n_zero / n_rec if n_rec > 0 else 0 + area_str = str(row["area"]) lines.append( - f"{row['state']:<4} {row['status']:<14} {hit:>5} {tot:>5} " + f"{area_str:<{id_w}} {row['status']:<14} {hit:>5} {tot:>5} " f"{viol:>5} {me:>8.4f} {mx:>8.4f} " f"{rmse:>7.3f} {p5:>7.3f} {med:>7.3f} " f"{p95:>7.3f} {wmax:>8.1f} {pct_zero:>5.1f}%" ) + if not show_all: + n_omitted = n_areas - len(display_df) + lines.append(f" ... {n_omitted} areas omitted (all within tolerance)") lines.append("") # Violated targets by variable @@ -357,7 +618,7 @@ def generate_report(areas=None, weight_dir=None): abs_miss = abs(v["achieved"] - v["target"]) all_violated.append( { - "state": row["state"], + "area": row["area"], "varname": varname, "cnt_type": cnt_type, "pct_err": v["pct_err"], @@ -372,16 +633,15 @@ def generate_report(areas=None, weight_dir=None): var_counts = vdf["varname"].value_counts() lines.append("VIOLATIONS BY VARIABLE:") for var, cnt in var_counts.items(): - states_with = sorted(vdf[vdf["varname"] == var]["state"].unique()) + areas_with = sorted(vdf[vdf["varname"] == var]["area"].unique()) lines.append( - f" {var}: {cnt} violations across " - f"{len(states_with)} states" + f" {var}: {cnt} violations across " f"{len(areas_with)} areas" ) lines.append("") - state_counts = vdf["state"].value_counts().head(10) - lines.append("STATES WITH MOST VIOLATIONS:") - for st, cnt in state_counts.items(): + area_counts = vdf["area"].value_counts().head(10) + lines.append("AREAS WITH MOST VIOLATIONS:") + for st, cnt in area_counts.items(): lines.append(f" {st}: {cnt} violated") lines.append("") @@ -394,7 +654,7 @@ def generate_report(areas=None, weight_dir=None): else: for _, r in amt_viol.head(5).iterrows(): lines.append( - f" {r['state']:<4} {r['pct_err']:.3f}% " + f" {r['area']:<4} {r['pct_err']:.3f}% " f"target=${r['target']:>15,.0f} " f"achieved=${r['achieved']:>15,.0f} " f"miss=${r['abs_miss']:>12,.0f} " @@ -411,7 +671,7 @@ def generate_report(areas=None, weight_dir=None): else: for _, r in cnt_viol.head(5).iterrows(): lines.append( - f" {r['state']:<4} {r['pct_err']:.3f}% " + f" {r['area']:<4} {r['pct_err']:.3f}% " f"target={r['target']:>12,.0f} " f"achieved={r['achieved']:>12,.0f} " f"miss={r['abs_miss']:>8,.0f} " @@ -419,28 +679,37 @@ def generate_report(areas=None, weight_dir=None): ) lines.append("") - # Weight diagnostics - lines.extend(_weight_diagnostics(areas, weight_dir)) + # Weight diagnostics (uses pre-loaded data if available) + if report_data is not None: + tmd, s006, state_weights, n_loaded = report_data + lines.extend( + _weight_diagnostics( + areas, + weight_dir, + target_dir, + tmd, + s006, + state_weights, + n_loaded, + ) + ) report = "\n".join(lines) return report -def _weight_diagnostics(areas, weight_dir=None): +def _load_report_data(areas, weight_dir): """ - Combined weight diagnostics: exhaustion + national aggregation. + Load TMD data and area weight files for diagnostics. - Loads TMD data and state weight files once, reuses for both. - Only reads the specific columns needed (not the full allvars). - """ - if weight_dir is None: - weight_dir = STATE_WEIGHT_DIR + Called once from generate_report(); results passed to all + diagnostic functions to avoid reloading. + Returns (tmd, s006, state_weights, n_loaded) or None if + no weight files found. + """ from tmd.storage import STORAGE_FOLDER - lines = [] - - # Load national data tmd_path = STORAGE_FOLDER / "output" / "tmd.csv.gz" tmd_cols = [ "RECID", @@ -455,6 +724,7 @@ def _weight_diagnostics(areas, weight_dir=None): "e00650", "e00900", "e01400", + "e01500", "e01700", "e02000", "e02300", @@ -468,7 +738,6 @@ def _weight_diagnostics(areas, weight_dir=None): ] tmd = pd.read_csv(tmd_path, usecols=tmd_cols) tmd["capgains_net"] = tmd["p22250"] + tmd["p23250"] - n_records = len(tmd) s006 = tmd["s006"].values agi_path = STORAGE_FOLDER / "output" / "cached_c00100.npy" @@ -495,7 +764,7 @@ def _weight_diagnostics(areas, weight_dir=None): for col in load_tc: tmd[col] = allvars[col].values - # Load all state weights once + n_records = len(tmd) weight_sum = np.zeros(n_records) state_weights = {} for st in areas: @@ -508,17 +777,151 @@ def _weight_diagnostics(areas, weight_dir=None): n_loaded = len(state_weights) if n_loaded == 0: + return None + return tmd, s006, state_weights, n_loaded + + +def _weight_distribution_by_stub( + tmd, s006, state_weights, n_loaded, target_dir +): + """ + Show weighted return counts and AGI by AGI stub: + national (old) vs sum-of-areas (new) and change. + """ + lines = [] + + if "c00100" not in tmd.columns: + return lines + + # Get AGI bins from target file + first_area = next(iter(state_weights.keys()), None) + if first_area is None: + return lines + tgt_path = target_dir / f"{first_area}_targets.csv" + if not tgt_path.exists(): + return lines + tgt_df = pd.read_csv(tgt_path, comment="#") + + agi_pairs = set() + for _, row in tgt_df.iterrows(): + lo, hi = float(row.agilo), float(row.agihi) + if lo < -1e10 and hi > 1e10: + continue + agi_pairs.add((lo, hi)) + agi_bins = sorted(agi_pairs) + if not agi_bins: return lines + c00100 = tmd["c00100"].values + lines.append(f"WEIGHT DISTRIBUTION BY AGI STUB ({n_loaded} areas):") + lines.append( + " National = s006-weighted totals." + " Sum-of-Areas = area-weight-weighted totals." + ) + + # Header + lines.append( + f" {'AGI Stub':<18}" + f" {'Natl Returns':>14} {'Area Returns':>14} {'Chg%':>7}" + f" {'Natl AGI ($B)':>14} {'Area AGI ($B)':>14} {'Chg%':>7}" + ) + lines.append(" " + "-" * 92) + + # Running totals + tot_nat_ret = 0.0 + tot_area_ret = 0.0 + tot_nat_agi = 0.0 + tot_area_agi = 0.0 + + for lo, hi in agi_bins: + in_bin = (c00100 >= lo) & (c00100 < hi) + mask = in_bin.astype(float) + + # Returns + nat_ret = float((s006 * mask).sum()) + area_ret = sum(float((w * mask).sum()) for w in state_weights.values()) + ret_chg = (area_ret / nat_ret - 1) * 100 if nat_ret else 0 + + # AGI + agi_vals = c00100 * mask + nat_agi = float((s006 * agi_vals).sum()) + area_agi = sum( + float((w * agi_vals).sum()) for w in state_weights.values() + ) + agi_chg = (area_agi / nat_agi - 1) * 100 if nat_agi else 0 + + tot_nat_ret += nat_ret + tot_area_ret += area_ret + tot_nat_agi += nat_agi + tot_area_agi += area_agi + + bin_label = _fmt_agi_bin(lo, hi) + flag = " *" if abs(ret_chg) > 2 or abs(agi_chg) > 2 else "" + lines.append( + f" {bin_label:<18}" + f" {nat_ret:>14,.0f} {area_ret:>14,.0f}" + f" {ret_chg:>+6.2f}%" + f" {nat_agi / 1e9:>14.1f} {area_agi / 1e9:>14.1f}" + f" {agi_chg:>+6.2f}%{flag}" + ) + + # Total row + tot_ret_chg = (tot_area_ret / tot_nat_ret - 1) * 100 if tot_nat_ret else 0 + tot_agi_chg = (tot_area_agi / tot_nat_agi - 1) * 100 if tot_nat_agi else 0 + lines.append(" " + "-" * 92) + lines.append( + f" {'TOTAL':<18}" + f" {tot_nat_ret:>14,.0f} {tot_area_ret:>14,.0f}" + f" {tot_ret_chg:>+6.2f}%" + f" {tot_nat_agi / 1e9:>14.1f} {tot_area_agi / 1e9:>14.1f}" + f" {tot_agi_chg:>+6.2f}%" + ) + + n_flagged = 0 + for lo, hi in agi_bins: + in_bin = (c00100 >= lo) & (c00100 < hi) + mask = in_bin.astype(float) + nat_ret = float((s006 * mask).sum()) + area_ret = sum(float((w * mask).sum()) for w in state_weights.values()) + ret_chg = (area_ret / nat_ret - 1) * 100 if nat_ret else 0 + agi_vals = c00100 * mask + nat_agi = float((s006 * agi_vals).sum()) + area_agi = sum( + float((w * agi_vals).sum()) for w in state_weights.values() + ) + agi_chg = (area_agi / nat_agi - 1) * 100 if nat_agi else 0 + if abs(ret_chg) > 2 or abs(agi_chg) > 2: + n_flagged += 1 + if n_flagged: + lines.append(f" * = {n_flagged} stubs with >2% change") + lines.append("") + + return lines + + +def _weight_diagnostics( + _areas, _weight_dir, target_dir, tmd, s006, state_weights, n_loaded +): + """ + Combined weight diagnostics: exhaustion + national aggregation. + + Uses pre-loaded TMD data and weight files (from _load_report_data). + """ + lines = [] + n_records = len(tmd) + # Weight exhaustion + weight_sum = np.zeros(n_records) + for w in state_weights.values(): + weight_sum += w usage = weight_sum / s006 - _sub = "sum of state weights / national weight" + _sub = "sum of area weights / national weight" lines.append(f"WEIGHT EXHAUSTION ({_sub}):") lines.append( f" A ratio of 1.0 means the record's national" f" weight is fully allocated across" - f" {n_loaded} states." + f" {n_loaded} areas." ) pcts = [0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100] quantiles = np.percentile(usage, pcts) @@ -575,8 +978,7 @@ def _weight_diagnostics(areas, weight_dir=None): f" ptshp=${r.e26270:,.0f}" ) lines.append( - f" top states: {top3}" - f" ({len(st_wts)} nonzero of {n_loaded})" + f" top areas: {top3}" f" ({len(st_wts)} nonzero of {n_loaded})" ) lines.append("") @@ -611,37 +1013,49 @@ def _weight_diagnostics(areas, weight_dir=None): state_sums[var] += float((w * tmd[var].values).sum()) lines.append( - f"CROSS-STATE AGGREGATION vs NATIONAL TOTALS" - f" for SELECTED VARIABLES ({n_loaded} states):" + f"CROSS-AREA AGGREGATION vs NATIONAL TOTALS" + f" for SELECTED VARIABLES ({n_loaded} areas):" + ) + lines.append( + " Do area weights preserve national totals? Diff% near 0 = good." ) lines.append( f" {'Variable':<30} {'National':>16}" - f" {'Sum-of-States':>16} {'Diff%':>8}" + f" {'Sum-of-Areas':>16} {'Diff':>14} {'Diff%':>8}" ) - lines.append(" " + "-" * 72) + lines.append(" " + "-" * 86) for label, var, is_count in check_vars: nat = national[var] sos = state_sums[var] if nat is None or nat == 0: continue + diff = sos - nat diff_pct = (sos / nat - 1) * 100 if is_count: lines.append( f" {label:<30} {nat:>16,.0f}" - f" {sos:>16,.0f} {diff_pct:>+7.2f}%" + f" {sos:>16,.0f}" + f" {diff:>+14,.0f}" + f" {diff_pct:>+7.2f}%" ) else: lines.append( f" {label:<30}" f" ${nat / 1e9:>14.1f}B" f" ${sos / 1e9:>14.1f}B" + f" ${diff / 1e9:>+12.2f}B" f" {diff_pct:>+7.2f}%" ) lines.append("") - # Bystander check: untargeted variables + # Bystander check: untargeted variables (aggregate) lines.extend(_bystander_check(tmd, s006, state_weights, n_loaded)) + # Bystander check: per-bin analysis for targeted + key untargeted vars + lines.extend( + _bystander_by_bin(tmd, s006, state_weights, n_loaded, target_dir) + ) + return lines @@ -687,12 +1101,7 @@ def _bystander_check(tmd, s006, state_weights, n_loaded): for label, var, is_count in bystander_vars: if var not in tmd.columns: continue - if var == "XTOT": - nat = float((s006 * tmd[var].values).sum()) - elif is_count: - nat = float((s006 * tmd[var].values).sum()) - else: - nat = float((s006 * tmd[var].values).sum()) + nat = float((s006 * tmd[var].values).sum()) if abs(nat) < 1: continue sos = 0.0 @@ -705,7 +1114,7 @@ def _bystander_check(tmd, s006, state_weights, n_loaded): results.sort(key=lambda x: -abs(x[5])) lines.append( - "BYSTANDER CHECK: UNTARGETED VARIABLES" f" ({n_loaded} states):" + "BYSTANDER CHECK: UNTARGETED VARIABLES" f" ({n_loaded} areas):" ) lines.append( " Variables NOT directly targeted — distortion" @@ -713,15 +1122,17 @@ def _bystander_check(tmd, s006, state_weights, n_loaded): ) lines.append( f" {'Variable':<30} {'National':>16}" - f" {'Sum-of-States':>16} {'Diff%':>8}" + f" {'Sum-of-Areas':>16} {'Diff':>14} {'Diff%':>8}" ) - lines.append(" " + "-" * 72) + lines.append(" " + "-" * 86) for label, _var, is_count, nat, sos, diff_pct in results: + diff = sos - nat flag = " ***" if abs(diff_pct) > 2 else "" if is_count: lines.append( f" {label:<30} {nat:>16,.0f}" f" {sos:>16,.0f}" + f" {diff:>+14,.0f}" f" {diff_pct:>+7.2f}%{flag}" ) else: @@ -729,6 +1140,7 @@ def _bystander_check(tmd, s006, state_weights, n_loaded): f" {label:<30}" f" ${nat / 1e9:>14.1f}B" f" ${sos / 1e9:>14.1f}B" + f" ${diff / 1e9:>+12.2f}B" f" {diff_pct:>+7.2f}%{flag}" ) @@ -748,24 +1160,283 @@ def _bystander_check(tmd, s006, state_weights, n_loaded): return lines +def _bystander_by_bin(tmd, s006, state_weights, n_loaded, target_dir): + """ + Per-AGI-bin bystander analysis. + + For both targeted and untargeted variable-bin combinations, + computes cross-area aggregation distortion. Shows whether + dropped targets (variable-bin combos excluded from the recipe) + are still well-behaved or drifting. + """ + lines = [] + + # Read one target file to identify what's targeted + first_area = next(iter(state_weights.keys()), None) + if first_area is None: + return lines + tgt_path = target_dir / f"{first_area}_targets.csv" + if not tgt_path.exists(): + return lines + tgt_df = pd.read_csv(tgt_path, comment="#") + + # Build set of targeted (varname, count, agilo, agihi, fstatus) + targeted_set = set() + for _, row in tgt_df.iterrows(): + targeted_set.add( + ( + row.varname, + int(row["count"]), + float(row.agilo), + float(row.agihi), + int(row.fstatus), + ) + ) + + # Get AGI bins from target file (exclude the all-bins row) + agi_pairs = set() + for _, row in tgt_df.iterrows(): + lo, hi = float(row.agilo), float(row.agihi) + if lo < -1e10 and hi > 1e10: + continue # skip all-bins + agi_pairs.add((lo, hi)) + agi_bins = sorted(agi_pairs) + + if not agi_bins or "c00100" not in tmd.columns: + return lines + + # Pre-compute AGI bin masks + agi_masks = {} + for lo, hi in agi_bins: + agi_masks[(lo, hi)] = (tmd["c00100"].values >= lo) & ( + tmd["c00100"].values < hi + ) + + # Variables to check: recipe vars + key untargeted + # (label, varname, count_type) + check_vars = [ + # Targeted amount variables + ("AGI", "c00100", 0), + ("Wages", "e00200", 0), + ("Interest", "e00300", 0), + ("Pensions tot", "e01500", 0), + ("Social Security", "e02400", 0), + ("SALT ded", "c18300", 0), + ("Partnership", "e26270", 0), + # Count targets + ("Returns", "c00100", 1), + ("Wage nz-count", "e00200", 2), + # Key untargeted (for bystander effect) + ("Income tax", "iitax", 0), + ("Cap gains net", "capgains_net", 0), + ("Sch C income", "e00900", 0), + ("IRA distrib", "e01400", 0), + ("Pensions txbl", "e01700", 0), + ("Sch E net", "e02000", 0), + ("Tax-exempt int", "e00400", 0), + ] + + results = [] + for label, varname, cnt in check_vars: + if varname not in tmd.columns: + continue + var_vals = tmd[varname].values + + for lo, hi in agi_bins: + in_bin = agi_masks[(lo, hi)] + + # Build the value array for this variable-bin-count combo + if cnt == 0: + vals = var_vals * in_bin + elif cnt == 1: + vals = in_bin.astype(float) + elif cnt == 2: + vals = ((var_vals != 0) & in_bin).astype(float) + else: + continue + + # National total + nat = float((s006 * vals).sum()) + if abs(nat) < 1: + continue + + # Sum-of-areas total + sos = 0.0 + for w in state_weights.values(): + sos += float((w * vals).sum()) + diff_pct = (sos / nat - 1) * 100 + + # Check if this combo is targeted (fstatus=0 for amounts/counts) + is_targeted = (varname, cnt, lo, hi, 0) in targeted_set + + results.append( + { + "label": label, + "varname": varname, + "cnt": cnt, + "lo": lo, + "hi": hi, + "nat": nat, + "sos": sos, + "diff_pct": diff_pct, + "targeted": is_targeted, + } + ) + + if not results: + return lines + + # Sort by absolute distortion + results.sort(key=lambda x: -abs(x["diff_pct"])) + + # Show top 30 by distortion + n_show = min(30, len(results)) + n_total = len(results) + cnt_labels = {0: "amt", 1: "returns", 2: "nz-count"} + lines.append(f"PER-BIN DISTORTION ANALYSIS ({n_loaded} areas):") + lines.append( + f" T = targeted, U = untargeted." + f" Sorted by |distortion|," + f" top {n_show} out of {n_total} combinations." + ) + col_hdr = ( + f" {'':>1} {'Variable':<18} {'Type':<10}" + f" {'AGI Bin':<16}" + f" {'National':>14} {'Sum-Areas':>14}" + f" {'Diff':>14} {'Diff%':>8}" + ) + lines.append(col_hdr) + lines.append(" " + "-" * (len(col_hdr) - 2)) + + for r in results[:n_show]: + mark = "T" if r["targeted"] else "U" + ct_label = cnt_labels.get(r["cnt"], f"cnt{r['cnt']}") + bin_label = _fmt_agi_bin(r["lo"], r["hi"]) + flag = " ***" if abs(r["diff_pct"]) > 5 else "" + diff = r["sos"] - r["nat"] + + if r["cnt"] == 0: + nat_s = f"${r['nat'] / 1e9:.2f}B" + sos_s = f"${r['sos'] / 1e9:.2f}B" + diff_s = f"${diff / 1e9:+.2f}B" + else: + nat_s = f"{r['nat']:,.0f}" + sos_s = f"{r['sos']:,.0f}" + diff_s = f"{diff:+,.0f}" + + lines.append( + f" {mark} {r['label']:<18} {ct_label:<10}" + f" {bin_label:<16}" + f" {nat_s:>14} {sos_s:>14}" + f" {diff_s:>14}" + f" {r['diff_pct']:>+7.2f}%{flag}" + ) + + # Summary + untargeted = [r for r in results if not r["targeted"]] + targeted_results = [r for r in results if r["targeted"]] + lines.append("") + lines.append( + f" Targeted: {len(targeted_results)}" + f" Untargeted: {len(untargeted)}" + ) + if untargeted: + worst_d = max(abs(r["diff_pct"]) for r in untargeted) + mean_d = np.mean([abs(r["diff_pct"]) for r in untargeted]) + n_bad = sum(1 for r in untargeted if abs(r["diff_pct"]) > 5) + lines.append( + f" Untargeted distortion:" + f" mean={mean_d:.2f}%, worst={worst_d:.2f}%" + ) + if n_bad: + lines.append( + f" *** = {n_bad} untargeted combos" f" with >5% distortion" + ) + else: + lines.append(" All untargeted combos within 5% distortion.") + if targeted_results: + worst_t = max(abs(r["diff_pct"]) for r in targeted_results) + mean_t = np.mean([abs(r["diff_pct"]) for r in targeted_results]) + lines.append( + f" Targeted-bin distortion:" + f" mean={mean_t:.2f}%, worst={worst_t:.2f}%" + ) + lines.append("") + + return lines + + def main(): parser = argparse.ArgumentParser( - description="Cross-state quality summary report", + description="Cross-area quality summary report", ) parser.add_argument( "--scope", default=None, - help="Comma-separated state codes (default: all states)", + help=( + "'states', 'cds', or comma-separated area codes" + " (default: all states)" + ), + ) + parser.add_argument( + "--output", + "-o", + nargs="?", + const="auto", + default=None, + help=( + "Save report to file. No argument = auto-generate" + " filename in weight directory. Or specify a path." + ), ) args = parser.parse_args() areas = None - if args.scope and args.scope.lower() != "states": - areas = [s.strip().upper() for s in args.scope.split(",")] + weight_dir = None + target_dir = None + scope_label = "states" + if args.scope: + scope_lower = args.scope.lower().strip() + if scope_lower == "cds": + weight_dir = CD_WEIGHT_DIR + target_dir = CD_TARGET_DIR + scope_label = "cds" + elif scope_lower == "states": + weight_dir = STATE_WEIGHT_DIR + target_dir = STATE_TARGET_DIR + scope_label = "states" + else: + codes = [s.strip().upper() for s in args.scope.split(",")] + # Detect CDs vs states by code length + if codes and len(codes[0]) > 2: + weight_dir = CD_WEIGHT_DIR + target_dir = CD_TARGET_DIR + scope_label = f"cds ({len(codes)} selected)" + else: + scope_label = f"states ({len(codes)} selected)" + areas = codes - report = generate_report(areas) + if weight_dir is None: + weight_dir = STATE_WEIGHT_DIR + target_dir = STATE_TARGET_DIR + + report = generate_report( + areas, + weight_dir=weight_dir, + target_dir=target_dir, + scope_label=scope_label, + ) print(report) + # Save to file + if args.output is not None: + if args.output == "auto": + output_path = weight_dir / "quality_report.txt" + else: + output_path = Path(args.output) + output_path.write_text(report, encoding="utf-8") + print(f"\nReport saved to: {output_path}") + if __name__ == "__main__": main()