Skip to content

Commit 3eaa686

Browse files
TrackD: add core_gl BYOD adapter v1 (#319)
1 parent 90690fb commit 3eaa686

5 files changed

Lines changed: 398 additions & 5 deletions

File tree

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# SPDX-License-Identifier: MIT
2+
"""Generic 'core_gl' adapter for Track D BYOD normalization.
3+
4+
This adapter is the bridge from "perfect" template exports (already matching the
5+
contract) to "slightly messy" Sheets/Excel exports.
6+
7+
Features (v1):
8+
- Header matching that tolerates case/spacing/punctuation (e.g., "Account ID")
9+
- Whitespace trimming across all cells
10+
- Money cleanup for debit/credit (commas, $, parentheses-as-negative)
11+
- Canonical output column order (required first, then passthrough extras)
12+
13+
Inputs
14+
------
15+
Reads contract-named files from ``tables/``:
16+
- chart_of_accounts.csv
17+
- gl_journal.csv
18+
19+
Outputs
20+
-------
21+
Writes contract-named files to ``normalized/`` with contract column names.
22+
"""
23+
24+
from __future__ import annotations
25+
26+
import csv
27+
from pathlib import Path
28+
from typing import Any
29+
30+
from .._errors import TrackDDataError, TrackDSchemaError
31+
from ..contracts import schemas_for_profile
32+
from .base import NormalizeContext
33+
from .mapping import (
34+
build_rename_map,
35+
clean_cell,
36+
detect_duplicate_destinations,
37+
parse_money,
38+
)
39+
40+
41+
# Fallback header spellings seen in real Sheets/Excel exports. Matching is done
# on *normalized* names (case/space/punctuation-insensitive), and aliases are
# only used when no source column matches the canonical name directly.

# chart_of_accounts.csv: canonical column -> accepted alias headers.
_COA_ALIASES: dict[str, tuple[str, ...]] = {
    "account_id": ("acct_id", "acct", "account", "account number", "account_no"),
    "account_name": ("acct_name", "name"),
    "account_type": ("type",),
    "normal_side": ("normal", "side"),
}

# gl_journal.csv: canonical column -> accepted alias headers.
_GL_ALIASES: dict[str, tuple[str, ...]] = {
    "txn_id": ("txnid", "transaction_id", "transaction id", "id"),
    "doc_id": ("doc", "document", "document_id", "document id"),
    "description": ("desc", "memo", "narrative"),
    # Same account aliases as the chart of accounts, so both files match.
    "account_id": ("acct_id", "acct", "account", "account number", "account_no"),
    "debit": ("dr", "debits"),
    "credit": ("cr", "credits"),
}
56+
57+
58+
def _write_normalized_csv(
    src: Path,
    dst: Path,
    *,
    required_columns: tuple[str, ...],
    aliases: dict[str, tuple[str, ...]] | None = None,
    money_columns: tuple[str, ...] = (),
) -> dict[str, Any]:
    """Normalize one CSV from ``src`` into ``dst`` per the Track D contract.

    Headers are matched tolerantly against ``required_columns`` (with
    ``aliases`` as a fallback), every cell is whitespace-trimmed, and values
    in ``money_columns`` are cleaned via :func:`parse_money`. Output columns
    are the required columns first, then passthrough extras under their
    original (trimmed) header names.

    Returns a report dict: ``src``, ``dst``, ``written_rows``,
    ``written_columns``.

    Raises
    ------
    TrackDDataError
        If the source CSV has no header row.
    TrackDSchemaError
        If multiple source columns map to the same required column.
    """
    with src.open("r", newline="", encoding="utf-8-sig") as f_in:
        reader = csv.DictReader(f_in)
        if not reader.fieldnames:
            raise TrackDDataError(f"CSV appears to have no header row: {src.name}")

        fieldnames = [str(c) for c in reader.fieldnames if c is not None]
        rename_map = build_rename_map(fieldnames, required_columns=required_columns, aliases=aliases)

        dups = detect_duplicate_destinations(rename_map)
        if dups:
            # Loop variables renamed so they do not shadow the `dst` Path parameter.
            pieces = [f"{dest_col}: {', '.join(src_cols)}" for dest_col, src_cols in sorted(dups.items())]
            raise TrackDSchemaError(
                "Ambiguous column mapping (multiple source columns map to the same required column).\n"
                + "\n".join(pieces)
            )

        # Determine output fields: required columns first, then passthrough extras.
        required_set = set(required_columns)
        extras: list[str] = []
        seen_extras: set[str] = set()
        for c in fieldnames:
            dest = rename_map.get(c, c)
            if dest in required_set:
                continue
            # Preserve original extra column names (trimmed). Skip names that
            # repeat after trimming (they would duplicate a header column) and
            # names that collide with a required column (the required slot wins,
            # mirroring the per-row handling below).
            trimmed = c.strip()
            if trimmed in seen_extras or trimmed in required_set:
                continue
            seen_extras.add(trimmed)
            extras.append(trimmed)

        out_fields = list(required_columns) + extras

        dst.parent.mkdir(parents=True, exist_ok=True)
        with dst.open("w", newline="", encoding="utf-8") as f_out:
            writer = csv.DictWriter(f_out, fieldnames=out_fields)
            writer.writeheader()
            n_rows = 0
            for row in reader:
                out_row: dict[str, str] = {k: "" for k in out_fields}

                # Map + clean required columns; extras keep their trimmed header.
                for src_col in fieldnames:
                    val = clean_cell(row.get(src_col))
                    dest = rename_map.get(src_col, src_col).strip()

                    # Extra columns: keep under original header (trimmed).
                    if dest not in required_set:
                        dest = src_col.strip()

                    if dest not in out_row:
                        # If an extra column name collides with required, prefer required slot.
                        continue

                    if dest in money_columns:
                        val = parse_money(val)

                    out_row[dest] = val

                writer.writerow(out_row)
                n_rows += 1

    return {
        "src": str(src),
        "dst": str(dst),
        "written_rows": n_rows,
        "written_columns": out_fields,
    }
130+
131+
132+
class CoreGLAdapter:
    """Generic adapter for the 'core_gl' Track D BYOD profile.

    Normalizes slightly messy Sheets/Excel exports of the contract tables
    (chart_of_accounts.csv, gl_journal.csv) into contract-shaped outputs
    under ``normalized/``.
    """

    name = "core_gl"

    def normalize(self, ctx: NormalizeContext) -> dict[str, Any]:
        """Normalize every contract table for *ctx* and return a report."""
        schemas = schemas_for_profile(ctx.profile)

        # Per-file settings: (header aliases, columns needing money cleanup).
        # Unknown files fall back to no aliases / no money columns.
        rules: dict[str, tuple[dict[str, tuple[str, ...]] | None, tuple[str, ...]]] = {
            "chart_of_accounts.csv": (_COA_ALIASES, ()),
            "gl_journal.csv": (_GL_ALIASES, ("debit", "credit")),
        }

        ctx.normalized_dir.mkdir(parents=True, exist_ok=True)

        reports: list[dict[str, Any]] = []
        for schema in schemas:
            source = ctx.tables_dir / schema.name
            target = ctx.normalized_dir / schema.name
            if not source.exists():
                raise TrackDDataError(f"Missing required input file for adapter '{self.name}': {source}")

            file_aliases, money_cols = rules.get(schema.name, (None, ()))
            reports.append(
                _write_normalized_csv(
                    source,
                    target,
                    required_columns=schema.required_columns,
                    aliases=file_aliases,
                    money_columns=money_cols,
                )
            )

        return {
            "ok": True,
            "adapter": self.name,
            "profile": ctx.profile,
            "project": str(ctx.project_root),
            "tables_dir": str(ctx.tables_dir),
            "normalized_dir": str(ctx.normalized_dir),
            "files": reports,
        }
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# SPDX-License-Identifier: MIT
2+
"""Small mapping/cleaning utilities for Track D BYOD adapters.
3+
4+
Design goals (Phase 3.1):
5+
- Keep utilities tiny and dependency-light (csv-first).
6+
- Support *boring* transformations that come up in Sheets/Excel exports:
7+
- column-name normalization / rename matching
8+
- whitespace trimming
9+
- simple money parsing (commas, $, parentheses-as-negative)
10+
11+
These helpers are intentionally not a full ETL framework. They exist to keep
12+
individual adapters readable and consistent.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import re
18+
from typing import Iterable
19+
20+
21+
# Pre-compiled header-normalization patterns used by normalize_col_name.
_RE_NON_ALNUM = re.compile(r"[^a-z0-9_]+")  # anything outside [a-z0-9_] becomes "_"
_RE_UNDERSCORES = re.compile(r"_+")  # collapse runs of underscores
# NOTE(review): _RE_MONEY appears unused in this module — parse_money strips
# parentheses manually. Confirm no external users before removing.
_RE_MONEY = re.compile(r"^\(?\s*(?P<body>.*)\s*\)?$")
24+
25+
26+
def normalize_col_name(name: str) -> str:
    """Normalize a column header for matching purposes.

    Lowercases, trims, converts spaces/hyphens (and any other non-alphanumeric
    run) to single underscores, and strips leading/trailing underscores.

    Examples
    --------
    "Account ID" -> "account_id"
    " normal-side " -> "normal_side"
    "DOC-ID" -> "doc_id"
    """
    lowered = (name or "").strip().lower()
    lowered = lowered.replace("-", "_").replace(" ", "_")
    lowered = re.sub(r"[^a-z0-9_]+", "_", lowered)
    return re.sub(r"_+", "_", lowered).strip("_")
41+
42+
43+
def build_rename_map(
    fieldnames: Iterable[str],
    *,
    required_columns: tuple[str, ...],
    aliases: dict[str, tuple[str, ...]] | None = None,
) -> dict[str, str]:
    """Build a mapping from source fieldnames to required/normalized names.

    Strategy:
    1) direct normalized match (case/spacing/punct insensitivity)
    2) optional aliases (also normalized), used only as a *fallback*

    Aliases are fallback-only because many exports include both a canonical
    column (e.g., "Description") and a near-synonym (e.g., "Memo"); aliases
    must not create ambiguity when a direct match already exists.

    Returns a dict mapping *source column name* -> *destination column name*.
    """
    columns = list(fieldnames)

    # Normalized lookup tables: canonical required names, then alias fallbacks.
    canonical = {normalize_col_name(name): name for name in required_columns}
    fallback = {
        normalize_col_name(alt): dest
        for dest, alts in (aliases or {}).items()
        for alt in alts
    }

    mapping: dict[str, str] = {}
    taken: set[str] = set()

    # Pass 1: direct matches against the required columns.
    for col in columns:
        dest = canonical.get(normalize_col_name(col))
        if dest is not None:
            mapping[col] = dest
            taken.add(dest)

    # Pass 2: aliases may only fill destinations no direct match claimed.
    for col in columns:
        if col in mapping:
            continue
        dest = fallback.get(normalize_col_name(col))
        if dest is not None and dest not in taken:
            mapping[col] = dest
            taken.add(dest)

    return mapping
94+
95+
96+
def detect_duplicate_destinations(rename_map: dict[str, str]) -> dict[str, list[str]]:
    """Return destinations that are mapped from multiple sources."""
    # Group source columns by their destination, then keep contested ones.
    grouped: dict[str, list[str]] = {}
    for source, destination in rename_map.items():
        grouped.setdefault(destination, []).append(source)
    duplicates = {dest: sources for dest, sources in grouped.items() if len(sources) > 1}
    return duplicates
103+
104+
105+
def clean_cell(value: object) -> str:
    """Trim surrounding whitespace; missing (``None``) values become ``""``."""
    return "" if value is None else str(value).strip()
111+
112+
113+
def parse_money(value: str) -> str:
    """Parse common spreadsheet money formats into a simple numeric string.

    Supported patterns:
    - "$1,234.00" -> "1234.00"
    - "(1,234.00)" or "($1,234.00)" -> "-1234.00"
    - "-1,234" -> "-1234"

    If the string is blank, returns blank.
    """
    text = (value or "").strip()
    if not text:
        return ""

    # Accountants' parentheses mean "negative".
    negative = text.startswith("(") and text.endswith(")")
    if negative:
        text = text[1:-1].strip()

    # Drop currency symbols and thousands separators.
    text = text.replace("$", "").replace(",", "").strip()
    if not text:
        return ""

    # Already signed: don't double-negate, even inside parentheses.
    if text.startswith("-"):
        return text

    return "-" + text if negative else text

src/pystatsv1/trackd/byod.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from pathlib import Path
1818
from typing import Any
1919

20-
from ._errors import TrackDDataError
20+
from ._errors import TrackDDataError, TrackDSchemaError
2121
from ._types import PathLike
2222
from .adapters.base import NormalizeContext, TrackDAdapter
2323
from .contracts import ALLOWED_PROFILES, schemas_for_profile
@@ -244,11 +244,16 @@ def _get_adapter(name: str | None) -> TrackDAdapter:
244244
n = (name or "").strip().lower() or "passthrough"
245245
if n == "passthrough":
246246
return _PassthroughAdapter()
247+
if n == "core_gl":
248+
from .adapters.core_gl import CoreGLAdapter
249+
250+
return CoreGLAdapter()
247251
raise TrackDDataError(
248-
f"Unknown adapter: {name}.\n" "Use one of: passthrough"
252+
f"Unknown adapter: {name}.\n" "Use one of: passthrough, core_gl"
249253
)
250254

251255

256+
252257
def normalize_byod_project(project: PathLike, *, profile: str | None = None) -> dict[str, Any]:
253258
"""Normalize BYOD project tables into ``normalized/`` outputs.
254259
@@ -293,8 +298,20 @@ def normalize_byod_project(project: PathLike, *, profile: str | None = None) ->
293298

294299
adapter = _get_adapter(cfg.get("adapter"))
295300

296-
# Validate required schema issues first, so adapters can assume headers exist.
297-
validate_dataset(tables_dir, profile=p)
301+
# Validation strategy:
302+
# - passthrough expects contract-shaped inputs under tables/
303+
# - other adapters may accept non-canonical headers, so we validate after normalize
304+
if getattr(adapter, "name", "") == "passthrough":
305+
# Validate required schema issues first, so passthrough can assume headers exist.
306+
validate_dataset(tables_dir, profile=p)
307+
else:
308+
# Light check: required files must exist; detailed schema validation runs on normalized outputs.
309+
schemas = schemas_for_profile(p)
310+
missing = [s.name for s in schemas if not (tables_dir / s.name).exists()]
311+
if missing:
312+
raise TrackDSchemaError(
313+
"Missing required files in tables/: " + ", ".join(missing)
314+
)
298315

299316
ctx = NormalizeContext(
300317
project_root=root,
@@ -303,4 +320,10 @@ def normalize_byod_project(project: PathLike, *, profile: str | None = None) ->
303320
raw_dir=(root / "raw"),
304321
normalized_dir=(root / "normalized"),
305322
)
306-
return adapter.normalize(ctx)
323+
report = adapter.normalize(ctx)
324+
325+
if getattr(adapter, "name", "") != "passthrough":
326+
# Ensure adapter output conforms to the Track D contract.
327+
validate_dataset(ctx.normalized_dir, profile=p)
328+
329+
return report

tests/test_trackd_byod_adapter_selection_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ def test_trackd_byod_normalize_uses_adapter_from_config(tmp_path: Path, capsys)
2121
assert rc == 1
2222
assert "unknown adapter" in out
2323
assert "passthrough" in out
24+
assert "core_gl" in out

0 commit comments

Comments
 (0)