Skip to content

Commit 8f4571c

Browse files
Add consumption and wealth taxes, parallel RF training, rebuild pipeline (#14)
Adds fuel duty, alcohol duty, tobacco duty, capital gains tax, stamp duty land tax, annual wealth tax (hypothetical) and reformable council tax. LCFS alcohol/tobacco split into separate fields. All new taxes feed into household total_tax and program breakdown. Parallelises EFRS Random Forest training/prediction with Rayon and reduces tree count to 50, cutting build time substantially. Adds scripts/rebuild_all.py to rebuild every clean dataset from raw UKDS files on GCS, plus year-fallback logic in the Python wrapper so non-FRS datasets work without per-year bucket uploads.
1 parent a5a9765 commit 8f4571c

23 files changed

Lines changed: 1057 additions & 79 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Consumption and wealth taxes: fuel duty, alcohol duty, tobacco duty, capital gains tax, stamp duty land tax, annual wealth tax (hypothetical), and reformable council tax. LCFS alcohol/tobacco spending split into separate fields for accurate duty modelling. All new taxes included in household total_tax and program breakdown output.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
`policyengine_uk_compiled.data` now falls back to the nearest available year on the bucket when the requested year is missing, letting the Rust engine uprate forward at runtime. Non-FRS datasets (SPI, LCFS, WAS, EFRS) are now usable from the Python wrapper without any per-year bucket upload.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
`scripts/rebuild_all.py` rebuilds every clean dataset from raw UKDS files on GCS. Downloads raw tab files from `ukds/<dataset>/<ref>/`, runs Rust extraction, and uploads clean CSVs back. Supports filtering with `--only` and `--year`. Companion raw files uploaded to `gs://policyengine-uk-microdata/ukds/` for FRS 2022/23 and 2023/24, LCFS 2019/20, 2021/22, 2022/23, SPI 2021/22 and 2022/23, and WAS rounds 7 and 8.

changelog.d/added/rf-parallel.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Parallelised EFRS Random Forest training and prediction with Rayon. The 28 wealth and consumption RF models now train in parallel across cores and share a single DenseMatrix, cutting EFRS build time substantially. Tree count reduced from 100 to 50, which keeps accuracy while halving per-model cost.

interfaces/python/policyengine_uk_compiled/data.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,86 @@ def _get_credentials() -> tuple[str, str]:
7171
DATASETS = ("frs", "efrs", "lcfs", "spi", "was")
7272

7373

74+
def _list_available_years(dataset: str, access_key: str, secret_key: str) -> list[int]:
    """List years available on the bucket for a given dataset.

    Pages through the bucket's XML object listing under the ``<dataset>/``
    prefix using marker-based pagination, then collects every four-digit
    year directory seen in the returned keys.

    Returns a sorted list of integer years found under the `<dataset>/` prefix.
    """
    import re

    keys: list[str] = []
    marker = ""
    while True:
        path = f"/?prefix={dataset}/&marker={marker}"
        headers = _sign_request("GET", "/", access_key, secret_key)
        url = f"https://{GCS_HOST}/{GCS_BUCKET}{path}"
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            body = resp.read().decode()
        found = re.findall(r"<Key>([^<]+)</Key>", body)
        if not found:
            break
        keys.extend(found)
        if "<IsTruncated>true</IsTruncated>" not in body:
            break
        # Marker pagination: the next page starts after the last key returned.
        marker = found[-1]

    years = set()
    # re.escape guards against regex metacharacters in the dataset name.
    # Current DATASETS values are plain letters, so this is defensive only.
    year_re = re.compile(rf"^{re.escape(dataset)}/(\d{{4}})/")
    for key in keys:
        m = year_re.match(key)
        if m:
            years.add(int(m.group(1)))
    return sorted(years)
104+
105+
106+
def _pick_nearest_year(available: list[int], requested: int) -> int:
    """Pick the nearest year to requested from available.

    Prefers the latest year at or below *requested* (so the engine uprates
    forward in time); if every available year is later than *requested*,
    returns the earliest one.

    Raises FileNotFoundError when *available* is empty.
    """
    if not available:
        raise FileNotFoundError("No years available on bucket")
    best_at_or_below = max((y for y in available if y <= requested), default=None)
    if best_at_or_below is not None:
        return best_at_or_below
    return min(available)
118+
119+
74120
def ensure_dataset_year(dataset: str, year: int) -> Path:
75121
"""Ensure clean CSVs for a dataset/year are available locally, downloading if needed.
76122
77-
Returns the path to the year directory (e.g. ~/.policyengine-uk-data/frs/2026/).
123+
If the requested year isn't on the bucket, downloads the nearest available
124+
year and returns its directory. The Rust engine handles uprating from the
125+
downloaded year to the requested year at run time.
126+
127+
Returns the path to the year directory actually downloaded (may differ from
128+
the requested year).
78129
"""
79130
year_dir = LOCAL_CACHE / dataset / str(year)
80131
expected_files = ["persons.csv", "benunits.csv", "households.csv"]
81132
if all((year_dir / f).exists() for f in expected_files):
82133
return year_dir
83134

84135
access_key, secret_key = _get_credentials()
136+
137+
# Determine which year to download. If the requested year isn't on the
138+
# bucket, fall back to the nearest available.
139+
available = _list_available_years(dataset, access_key, secret_key)
140+
# If we already cached the nearest year locally, use that.
141+
if available:
142+
download_year = _pick_nearest_year(available, year)
143+
if download_year != year:
144+
near_dir = LOCAL_CACHE / dataset / str(download_year)
145+
if all((near_dir / f).exists() for f in expected_files):
146+
return near_dir
147+
year_dir = near_dir
148+
else:
149+
download_year = year
150+
85151
year_dir.mkdir(parents=True, exist_ok=True)
86152
for f in expected_files:
87-
key = f"{dataset}/{year}/{f}"
153+
key = f"{dataset}/{download_year}/{f}"
88154
dest = year_dir / f
89155
if dest.exists():
90156
continue

parameters/2025_26.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,55 @@ vat:
214214
reduced_rate: 0.05
215215
zero_rate: 0.0
216216

217+
fuel_duty:
  # Hydrocarbon Oil Duties Act 1979 s.6; SI 2022/269 (5p cut); extended to 2025/26
  # 52.95p/litre on unleaded petrol and diesel
  petrol_rate_per_litre: 0.5295
  diesel_rate_per_litre: 0.5295
  # Average pump prices (BEIS weekly fuel prices, Q1 2025/26 average), GBP/litre.
  # Used to convert household fuel spending into litres for the duty model.
  average_petrol_price_per_litre: 1.35
  average_diesel_price_per_litre: 1.40

alcohol_duty:
  # Alcoholic Liquor Duties Act 1979, reformed by Finance (No. 2) Act 2023 s.46-88
  # OBR 2025/26: £11.9bn revenue from ~£30bn household alcohol spending
  # Effective duty as a share of household alcohol spending.
  effective_rate: 0.40

tobacco_duty:
  # Tobacco Products Duty Act 1979; escalator RPI + 2% (Finance Act 2024)
  # OBR 2025/26: £8bn revenue from ~£11bn household tobacco spending
  # Effective duty as a share of household tobacco spending.
  effective_rate: 0.72

council_tax:
  # Local Government Finance Act 1992 s.1-5
  # DLUHC Council Tax levels statistics 2025/26: England average Band D = £2,280
  average_band_d: 2280.0

capital_gains_tax:
  # Taxation of Chargeable Gains Act 1992; Finance Act 2024 s.7 (rate increases)
  # AEA reduced to £3,000 from 2024/25 (Finance Act 2023 s.4)
  annual_exempt_amount: 3000.0
  basic_rate: 0.18
  higher_rate: 0.24
  # Share of accrued gains assumed to be realised in the year.
  realisation_rate: 0.50

stamp_duty:
  # Finance Act 2003 s.55, as amended; bands from 1 April 2025
  # Marginal-rate bands; each rate applies above its threshold (GBP).
  bands:
    - { rate: 0.0, threshold: 0 }
    - { rate: 0.02, threshold: 125001 }
    - { rate: 0.05, threshold: 250001 }
    - { rate: 0.10, threshold: 925001 }
    - { rate: 0.12, threshold: 1500001 }
  annual_purchase_probability: 0.043 # ~1/23 year average holding period

wealth_tax:
  # Hypothetical — no current UK legislation. Disabled by default.
  # Wealth Tax Commission (2020) proposed 1% above £10m.
  enabled: false
  threshold: 10000000.0
  rate: 0.01
265+
217266
growth_factors:
218267
# OBR Economic and Fiscal Outlook, March 2026
219268
# Table 1.7 (Inflation) and Table 1.6 (Labour Market)

scripts/rebuild_all.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
"""Rebuild every clean dataset from raw UKDS files held on GCS.
2+
3+
Pipeline per job: download raw tab files from gs://policyengine-uk-microdata/ukds/
4+
→ run the Rust extraction → upload clean CSVs to gs://policyengine-uk-microdata/<dataset>/<year>/.
5+
6+
Assumes:
7+
- `gcloud storage` CLI is authenticated and can read/write the bucket.
8+
- `cargo` is on PATH and the workspace builds cleanly.
9+
10+
Usage:
11+
python scripts/rebuild_all.py # rebuild everything
12+
python scripts/rebuild_all.py --only lcfs # rebuild just LCFS years
13+
python scripts/rebuild_all.py --only frs --year 2023
14+
python scripts/rebuild_all.py --only efrs # rebuild EFRS for all FRS years we have
15+
python scripts/rebuild_all.py --work-dir /tmp/pe # use a fixed working dir (cached)
16+
python scripts/rebuild_all.py --keep # keep the working dir after running
17+
"""
18+
19+
from __future__ import annotations
20+
21+
import argparse
22+
import os
23+
import shutil
24+
import subprocess
25+
import sys
26+
import tempfile
27+
from dataclasses import dataclass
28+
from pathlib import Path
29+
30+
BUCKET = "gs://policyengine-uk-microdata"
RAW_PREFIX = f"{BUCKET}/ukds"
REPO_ROOT = Path(__file__).resolve().parent.parent

# Extra search paths for gcloud/cargo that might not be on the default subprocess PATH.
_EXTRA_PATHS = [
    Path.home() / ".cargo" / "bin",
    Path.home() / "Downloads" / "google-cloud-sdk" / "bin",
    Path("/opt/homebrew/bin"),
    Path("/usr/local/bin"),
]
for _p in _EXTRA_PATHS:
    # Compare against whole ':'-separated PATH entries rather than doing a
    # substring match, so an entry like "/usr/local/binx" cannot mask
    # "/usr/local/bin" and wrongly suppress the prepend.
    if _p.is_dir() and str(_p) not in os.environ.get("PATH", "").split(":"):
        os.environ["PATH"] = f"{_p}:{os.environ.get('PATH', '')}"
44+
45+
46+
def _require(tool: str) -> None:
    """Abort with an actionable message unless *tool* resolves on PATH."""
    if shutil.which(tool) is not None:
        return
    raise SystemExit(
        f"{tool!r} not found on PATH. Install it or add it to PATH before running."
    )
51+
52+
53+
@dataclass
class ExtractJob:
    """One raw survey → clean CSV extraction.

    Describes where the raw UKDS files live under ukds/ on the bucket, which
    flag selects the Rust extractor for them, and which clean output year the
    extraction produces.
    """
    dataset: str  # frs | lcfs | spi | was
    year: int  # target fiscal year for the clean output directory
    raw_ref: str  # path under ukds/ (e.g. "frs/2023", "was/round_7")
    rust_flag: str  # --frs | --lcfs | --spi | --was
60+
61+
62+
# Manifest of everything we can rebuild. Extend as new raw years arrive on the bucket.
# WAS releases are by "round" rather than fiscal year: round_7 maps to 2020,
# round_8 to 2022 (see the raw_ref column below).
JOBS: list[ExtractJob] = [
    ExtractJob("frs", 2022, "frs/2022", "--frs"),
    ExtractJob("frs", 2023, "frs/2023", "--frs"),
    ExtractJob("lcfs", 2019, "lcfs/2019", "--lcfs"),
    ExtractJob("lcfs", 2021, "lcfs/2021", "--lcfs"),
    ExtractJob("lcfs", 2022, "lcfs/2022", "--lcfs"),
    ExtractJob("spi", 2021, "spi/2021", "--spi"),
    ExtractJob("spi", 2022, "spi/2022", "--spi"),
    ExtractJob("was", 2020, "was/round_7", "--was"),
    ExtractJob("was", 2022, "was/round_8", "--was"),
]

# EFRS pipeline: (fiscal_year, frs_year, was_ref, lcfs_ref)
# Picks the raw references it composes from: clean FRS for frs_year plus the
# raw WAS and LCFS releases named here.
EFRS_JOBS: list[tuple[int, int, str, str]] = [
    (2023, 2023, "was/round_7", "lcfs/2021"),
]
80+
81+
82+
def run(cmd: list, cwd: Path | None = None) -> None:
83+
print(f" $ {' '.join(str(c) for c in cmd)}", flush=True)
84+
subprocess.run([str(c) for c in cmd], cwd=cwd, check=True)
85+
86+
87+
def gcs_copy_in(ref: str, dest: Path) -> None:
    """Download everything under ukds/<ref>/ into dest/."""
    dest.mkdir(parents=True, exist_ok=True)
    source_glob = f"{RAW_PREFIX}/{ref}/*"
    # gcloud storage cp -r copies the listed objects verbatim.
    run(["gcloud", "storage", "cp", "-r", source_glob, str(dest)])
92+
93+
94+
def gcs_copy_out(local_dir: Path, dataset: str, year: int) -> None:
    """Upload every clean CSV in *local_dir* to <bucket>/<dataset>/<year>/.

    Raises SystemExit if the directory contains no CSVs, since that means the
    preceding extraction step produced nothing.
    """
    dest = f"{BUCKET}/{dataset}/{year}/"
    # Upload every *.csv the extraction produced; non-CSV stray files are ignored.
    files = sorted(local_dir.glob("*.csv"))
    if not files:
        raise SystemExit(f"No CSV files in {local_dir}; extraction probably failed")
    run(["gcloud", "storage", "cp", *[str(f) for f in files], dest])
101+
102+
103+
def ensure_raw(ref: str, work: Path) -> Path:
    """Download raw ukds/<ref> to work/raw/<ref>, caching if already present."""
    raw_dir = work / "raw" / ref
    already_populated = raw_dir.is_dir() and any(raw_dir.iterdir())
    if not already_populated:
        gcs_copy_in(ref, raw_dir)
        return raw_dir
    print(f" (cached) {raw_dir}")
    return raw_dir
111+
112+
113+
def extract_one(job: ExtractJob, work: Path) -> Path:
    """Extract one raw survey to clean CSVs and upload them to the bucket.

    Returns the local directory holding the clean output.
    """
    print(f"\n=== {job.dataset.upper()} {job.year} ===")
    raw_dir = ensure_raw(job.raw_ref, work)
    clean_dir = work / "clean" / job.dataset / str(job.year)
    clean_dir.mkdir(parents=True, exist_ok=True)
    extract_cmd = [
        "cargo", "run", "--release", "--quiet", "--",
        job.rust_flag, str(raw_dir),
        "--year", str(job.year),
        "--extract", str(clean_dir),
    ]
    run(extract_cmd, cwd=REPO_ROOT)
    gcs_copy_out(clean_dir, job.dataset, job.year)
    return clean_dir
129+
130+
131+
def extract_efrs(fiscal_year: int, frs_year: int, was_ref: str, lcfs_ref: str, work: Path) -> None:
    """Compose and upload an EFRS dataset for *fiscal_year*.

    Combines clean FRS CSVs for *frs_year* with the raw WAS and LCFS releases
    named by *was_ref*/*lcfs_ref*, runs the Rust --extract-efrs pipeline, and
    uploads the result to <bucket>/efrs/<fiscal_year>/.
    """
    print(f"\n=== EFRS {fiscal_year} (from FRS {frs_year}, {was_ref}, {lcfs_ref}) ===")

    # Need clean FRS as the base: if we already extracted it in this run it's on disk;
    # otherwise download the clean files from the bucket into work/clean/frs/<year>/.
    frs_clean = work / "clean" / "frs" / str(frs_year)
    if not frs_clean.is_dir() or not (frs_clean / "households.csv").exists():
        frs_clean.mkdir(parents=True, exist_ok=True)
        run([
            "gcloud", "storage", "cp",
            f"{BUCKET}/frs/{frs_year}/persons.csv",
            f"{BUCKET}/frs/{frs_year}/benunits.csv",
            f"{BUCKET}/frs/{frs_year}/households.csv",
            str(frs_clean) + "/",
        ])

    frs_base = work / "clean" / "frs"  # parent dir with YYYY/ subdirs
    # Raw WAS/LCFS are fed to the Rust side unprocessed; it handles extraction.
    was_raw = ensure_raw(was_ref, work)
    lcfs_raw = ensure_raw(lcfs_ref, work)

    efrs_out = work / "clean" / "efrs" / str(fiscal_year)
    efrs_out.mkdir(parents=True, exist_ok=True)
    run(
        [
            "cargo", "run", "--release", "--quiet", "--",
            "--extract-efrs", str(efrs_out),
            "--data", str(frs_base),
            "--year", str(fiscal_year),
            "--was-dir", str(was_raw),
            "--lcfs-dir", str(lcfs_raw),
        ],
        cwd=REPO_ROOT,
    )
    gcs_copy_out(efrs_out, "efrs", fiscal_year)
165+
166+
167+
def main() -> None:
    """CLI entry point: parse filters, rebuild selected datasets, then EFRS.

    Returns None, so the `sys.exit(main())` guard below exits with status 0
    on success; failures surface as exceptions (CalledProcessError/SystemExit).
    """
    parser = argparse.ArgumentParser(description=__doc__ or "")
    parser.add_argument(
        "--only",
        choices=["frs", "lcfs", "spi", "was", "efrs"],
        help="Only rebuild one dataset family",
    )
    parser.add_argument("--year", type=int, help="Only rebuild this fiscal year")
    parser.add_argument(
        "--work-dir",
        type=Path,
        help="Use this directory instead of a temp dir (enables caching)",
    )
    parser.add_argument(
        "--keep",
        action="store_true",
        help="Keep the working dir after running (ignored with --work-dir)",
    )
    args = parser.parse_args()

    # Fail fast before any downloads if the required tooling is missing.
    _require("gcloud")
    _require("cargo")

    if args.work_dir:
        # A user-supplied dir acts as a persistent cache: never delete it.
        work = args.work_dir.resolve()
        work.mkdir(parents=True, exist_ok=True)
        cleanup = False
    else:
        # Fresh temp dir; removed afterwards unless --keep was passed.
        work = Path(tempfile.mkdtemp(prefix="pe-uk-rebuild-"))
        cleanup = not args.keep
    print(f"Working directory: {work}")

    # Filter the manifest by dataset family and/or year. --only efrs leaves
    # selected_jobs unused because the per-survey loop is skipped below.
    selected_jobs = JOBS
    if args.only and args.only != "efrs":
        selected_jobs = [j for j in JOBS if j.dataset == args.only]
    if args.year is not None:
        selected_jobs = [j for j in selected_jobs if j.year == args.year]

    # EFRS runs when no family filter was given or when explicitly selected.
    run_efrs = args.only in (None, "efrs")

    try:
        if args.only != "efrs":
            for job in selected_jobs:
                extract_one(job, work)

        if run_efrs:
            for fiscal_year, frs_year, was_ref, lcfs_ref in EFRS_JOBS:
                if args.year is not None and fiscal_year != args.year:
                    continue
                extract_efrs(fiscal_year, frs_year, was_ref, lcfs_ref, work)
    finally:
        # Remove the temp working dir even if an extraction step failed.
        if cleanup:
            shutil.rmtree(work, ignore_errors=True)

    print("\nAll done.")


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)