Skip to content

Commit a7ad94a

Browse files
Merge pull request #3 from PolicyEngine/lcfs-income-weights-spi-was-2026
Fix LCFS income/weights, add --uprate-to flag
2 parents b4e0717 + acc6a7e commit a7ad94a

4 files changed

Lines changed: 66 additions & 5 deletions

File tree

SKILL.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ For microdata (per-entity DataFrames): use `--output-microdata-stdout` and parse
3838
| `--output json` | Machine-readable aggregate output |
3939
| `--output-microdata-stdout` | Per-entity CSVs to stdout |
4040
| `--export-params-json` | Dump baseline parameters |
41+
| `--uprate-to YYYY` | With `--extract`: uprate dataset to target year before writing clean CSVs |
4142

4243
## Data
4344

@@ -95,6 +96,25 @@ Four raw survey inputs are supported. All use the same two-step flow: `--extract
9596

9697
**UKDS data**: LCFS (SN 9468), WAS (SN 7215), SPI (SN 9422) are all under project `ecf0b3c4-29d2-4d8a-931d-0e3773a4ac0b`. Download tab zips from UKDS MCP and unzip before extracting.
9798

99+
## Versioning and releasing
100+
101+
Versions are managed via `pyproject.toml` (the source of truth) and towncrier-style changelog fragments in `changelog.d/`.
102+
103+
- **Do not** edit `CHANGELOG.md` or `Cargo.toml` versions directly — they are updated automatically by CI.
104+
- To ship a change, drop a fragment file in `changelog.d/` with the naming convention `<slug>.<type>`:
105+
106+
| File suffix | Semver bump |
107+
|---|---|
108+
| `.fixed` | patch |
109+
| `.changed` | patch |
110+
| `.added` | minor |
111+
| `.removed` | minor |
112+
| `.breaking` | major |
113+
114+
Example: `changelog.d/parse-id-list-delimiters.fixed`
115+
116+
The content of the file is the human-readable changelog entry. CI runs `.github/bump_version.py` to infer the bump from the fragment types and update `pyproject.toml`, then runs `publish-git-tag.sh` to tag and release.
117+
98118
## Building
99119

100120
```
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Fix LCFS income columns and weights; add --uprate-to flag; generate 2026/27 clean data for FRS, LCFS, SPI, and WAS.
2+
3+
LCFS loader: switch employment income to wkgrossp (weekly gross pay, well-populated), add p047p for main-job self-employment income (summed with the existing b3262p subsidiary-job amount), add p048p for investment income, and rescale weighta to the UK household population (~28.3m) so weighted aggregates are correct.
4+
5+
Add --uprate-to flag to --extract mode, allowing raw survey data to be extracted and uprated to a target fiscal year in one step (e.g. --frs raw/ --year 2023 --uprate-to 2026 --extract data/frs/2026/).
6+
7+
Update SKILL.md to document --uprate-to and the UKDS project ID for LCFS/WAS/SPI downloads.

src/data/clean.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,32 @@ fn parse_id_list(s: &str) -> Vec<usize> {
687687
if s.is_empty() {
688688
return Vec::new();
689689
}
690-
s.split(';').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
690+
s.split(|c| c == ';' || c == ',').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
691+
}
692+
693+
#[cfg(test)]
694+
mod tests {
695+
use super::parse_id_list;
696+
697+
#[test]
698+
fn parse_id_list_semicolons() {
699+
assert_eq!(parse_id_list("0;1;2"), vec![0, 1, 2]);
700+
}
701+
702+
#[test]
703+
fn parse_id_list_commas() {
704+
assert_eq!(parse_id_list("0,1"), vec![0, 1]);
705+
}
706+
707+
#[test]
708+
fn parse_id_list_single() {
709+
assert_eq!(parse_id_list("3"), vec![3]);
710+
}
711+
712+
#[test]
713+
fn parse_id_list_empty() {
714+
assert_eq!(parse_id_list(""), Vec::<usize>::new());
715+
}
691716
}
692717

693718
fn parse_region(s: &str) -> Region {

src/data/lcfs.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
3737
let person_table = load_table_cols(data_dir, &person_file, Some(&[
3838
"case", "person",
3939
"a003", "a004", "a002", // age (two variants), sex
40-
"b303p", "b3262p", // employment income, self-employment income
40+
"wkgrossp", // weekly gross pay (employee, well-populated)
41+
"p047p", "b3262p", // SE income: main job, subsidiary job
42+
"p048p", // investment income (weekly)
4143
"b3381", "p049p", // state pension, private pension income
4244
]))?;
4345

@@ -48,13 +50,19 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
4850
persons_by_case.entry(case).or_default().push(row);
4951
}
5052

53+
// weighta is a design weight summing to roughly the sample size (~28,000-30,000).
54+
// Rescale to UK household population (~28.3m) so that weighted sums are population totals.
55+
let weighta_sum: f64 = hh_table.iter().map(|r| get_f64(r, "weighta").max(0.0)).sum();
56+
const UK_HOUSEHOLDS: f64 = 28_300_000.0;
57+
let weight_scale = if weighta_sum > 0.0 { UK_HOUSEHOLDS / weighta_sum } else { 1.0 };
58+
5159
let mut people = Vec::new();
5260
let mut benunits = Vec::new();
5361
let mut households = Vec::new();
5462

5563
for hh_row in &hh_table {
5664
let case = get_i64(hh_row, "case");
57-
let weight = get_f64(hh_row, "weighta");
65+
let weight = get_f64(hh_row, "weighta") * weight_scale;
5866
if weight <= 0.0 { continue; }
5967

6068
let region = region_from_gvtregno(get_i64(hh_row, "gorx"));
@@ -96,8 +104,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
96104
is_benunit_head: is_head,
97105
is_household_head: is_head,
98106
is_in_scotland: region.is_scotland(),
99-
employment_income: get_f64(prow, "b303p").max(0.0) * WEEKS_IN_YEAR,
100-
self_employment_income: get_f64(prow, "b3262p").max(0.0) * WEEKS_IN_YEAR,
107+
employment_income: get_f64(prow, "wkgrossp").max(0.0) * WEEKS_IN_YEAR,
108+
self_employment_income: (get_f64(prow, "p047p") + get_f64(prow, "b3262p")).max(0.0) * WEEKS_IN_YEAR,
109+
savings_interest_income: get_f64(prow, "p048p").max(0.0) * WEEKS_IN_YEAR,
101110
state_pension: get_f64(prow, "b3381").max(0.0) * WEEKS_IN_YEAR,
102111
pension_income: get_f64(prow, "p049p").max(0.0) * WEEKS_IN_YEAR,
103112
// Allocate total household benefit income to head as passthrough

0 commit comments

Comments
 (0)