diff --git a/README.md b/README.md index b6ab1e4..b56766a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,70 @@ # pricing-model-lab + Python pricing analysis lab for comparing model-derived values, observed values, and deviation thresholds for decision support workflows. + +## Features + +- Load a CSV with columns: `item_id`, `observed_value`, `model_value` +- Validate inputs (numeric checks, missing-value detection) +- Calculate relative deviation: `(observed_value - model_value) / model_value` +- Add a `review_flag` column based on a configurable threshold (default **3%**) +- Output a clean CSV sorted by absolute deviation (largest first) +- CLI interface via `argparse` + +## Project Structure + +``` +pricing-model-lab/ +├── data/ +│ └── sample_data.csv # Synthetic sample dataset +├── pricing_model_lab/ +│ ├── __init__.py +│ ├── processor.py # Core pipeline logic +│ └── cli.py # Command-line interface +├── tests/ +│ └── test_processor.py # pytest tests +├── pyproject.toml +└── requirements.txt +``` + +## Installation + +```bash +pip install -e . 
+``` + +## Usage + +### Command Line + +```bash +pricing-model-lab INPUT_CSV OUTPUT_CSV [--threshold THRESHOLD] +``` + +| Argument | Description | +|---|---| +| `INPUT_CSV` | Path to input CSV (columns: `item_id`, `observed_value`, `model_value`) | +| `OUTPUT_CSV` | Path for the processed output CSV | +| `--threshold` | Absolute deviation threshold for the review flag (default: `0.03` → 3%) | + +**Example:** + +```bash +pricing-model-lab data/sample_data.csv output.csv --threshold 0.05 +``` + +### Python API + +```python +from pricing_model_lab.processor import process + +result = process("data/sample_data.csv", "output.csv", threshold=0.03) +print(result.head()) +``` + +## Running Tests + +```bash +pytest +``` + diff --git a/data/sample_data.csv b/data/sample_data.csv new file mode 100644 index 0000000..e29555b --- /dev/null +++ b/data/sample_data.csv @@ -0,0 +1,21 @@ +item_id,observed_value,model_value +ITEM_001,102.50,100.00 +ITEM_002,98.00,100.00 +ITEM_003,105.00,100.00 +ITEM_004,107.50,100.00 +ITEM_005,95.00,100.00 +ITEM_006,110.00,100.00 +ITEM_007,99.50,100.00 +ITEM_008,88.00,100.00 +ITEM_009,101.00,100.00 +ITEM_010,115.00,100.00 +ITEM_011,52.00,50.00 +ITEM_012,48.50,50.00 +ITEM_013,53.50,50.00 +ITEM_014,57.00,50.00 +ITEM_015,46.00,50.00 +ITEM_016,200.00,195.00 +ITEM_017,185.00,195.00 +ITEM_018,203.00,195.00 +ITEM_019,210.00,195.00 +ITEM_020,180.00,195.00 diff --git a/pricing_model_lab/__init__.py b/pricing_model_lab/__init__.py new file mode 100644 index 0000000..84d01a2 --- /dev/null +++ b/pricing_model_lab/__init__.py @@ -0,0 +1,3 @@ +"""pricing-model-lab: compare observed vs. 
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``pricing-model-lab`` command.

    The parser takes two positional paths (input CSV, output CSV) and an
    optional ``--threshold`` float that defaults to ``DEFAULT_THRESHOLD``.
    """
    parser = argparse.ArgumentParser(
        prog="pricing-model-lab",
        description=(
            "Compare observed vs. model values, compute deviations, "
            "and flag items that exceed a configurable threshold."
        ),
    )
    parser.add_argument(
        "input",
        metavar="INPUT_CSV",
        help="Path to the input CSV file (columns: item_id, observed_value, model_value).",
    )
    parser.add_argument(
        "output",
        metavar="OUTPUT_CSV",
        help="Path where the processed output CSV will be written.",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        metavar="THRESHOLD",
        # argparse %-formats help text, so a literal percent sign is "%%".
        help=(
            "Absolute deviation threshold for the review flag "
            f"(default: {DEFAULT_THRESHOLD * 100:.0f}%%)."
        ),
    )
    return parser


def main(argv: list[str] | None = None) -> None:
    """Run the pipeline from the command line and print a one-line summary.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Exits with status 1, after printing ``Error: ...`` to stderr, when the
    pipeline raises ``ValueError`` or any ``OSError``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        result = process(args.input, args.output, threshold=args.threshold)
    except (OSError, ValueError) as exc:
        # OSError is the parent of FileNotFoundError, so a missing input is
        # still handled; it additionally covers an unwritable output path
        # (missing directory, permission denied, path is a directory), which
        # previously surfaced as a raw traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    flagged = result["review_flag"].sum()
    total = len(result)
    print(
        f"Processed {total} items. "
        f"{flagged} flagged (|deviation| > {args.threshold:.2%}). "
        f"Output written to: {args.output}"
    )


if __name__ == "__main__":
    main()
" + f"Output written to: {args.output}" + ) + + +if __name__ == "__main__": + main() diff --git a/pricing_model_lab/processor.py b/pricing_model_lab/processor.py new file mode 100644 index 0000000..c2f8c6a --- /dev/null +++ b/pricing_model_lab/processor.py @@ -0,0 +1,121 @@ +"""Core processing logic for the pricing model lab.""" + +from __future__ import annotations + +import pandas as pd + +REQUIRED_COLUMNS = {"item_id", "observed_value", "model_value"} +DEFAULT_THRESHOLD = 0.03 # 3% + + +def load_csv(filepath: str) -> pd.DataFrame: + """Load a CSV file and return a DataFrame. + + Raises + ------ + FileNotFoundError + If *filepath* does not exist. + ValueError + If required columns are missing. + """ + df = pd.read_csv(filepath) + + missing = REQUIRED_COLUMNS - set(df.columns) + if missing: + raise ValueError(f"CSV is missing required columns: {sorted(missing)}") + + return df + + +def validate(df: pd.DataFrame) -> pd.DataFrame: + """Validate that numeric columns are numeric and contain no missing values. + + Returns a cleaned copy with numeric columns cast to float. + + Raises + ------ + ValueError + If non-numeric data or missing values are found in *observed_value* + or *model_value*. + """ + df = df.copy() + + for col in ("observed_value", "model_value"): + df[col] = pd.to_numeric(df[col], errors="coerce") + + if df[col].isna().any(): + raise ValueError( + f"Column '{col}' contains missing or non-numeric values." + ) + + if (df["model_value"] == 0).any(): + raise ValueError( + "Column 'model_value' contains zero values; deviation is undefined." 
+ ) + + if df["item_id"].isna().any(): + raise ValueError("Column 'item_id' contains missing values.") + + return df + + +def calculate_deviation(df: pd.DataFrame) -> pd.DataFrame: + """Add a *deviation* column: (observed_value - model_value) / model_value.""" + df = df.copy() + df["deviation"] = (df["observed_value"] - df["model_value"]) / df["model_value"] + return df + + +def add_review_flag( + df: pd.DataFrame, threshold: float = DEFAULT_THRESHOLD +) -> pd.DataFrame: + """Add a boolean *review_flag* column. + + A row is flagged when the absolute deviation exceeds *threshold*. + """ + if threshold < 0: + raise ValueError("threshold must be non-negative.") + + df = df.copy() + df["review_flag"] = df["deviation"].abs() > threshold + return df + + +def sort_by_abs_deviation(df: pd.DataFrame) -> pd.DataFrame: + """Return *df* sorted by absolute deviation in descending order.""" + df = df.copy() + df["_abs_deviation"] = df["deviation"].abs() + df = df.sort_values("_abs_deviation", ascending=False).drop( + columns=["_abs_deviation"] + ) + return df.reset_index(drop=True) + + +def process( + input_path: str, + output_path: str, + threshold: float = DEFAULT_THRESHOLD, +) -> pd.DataFrame: + """Run the full pipeline and write results to *output_path*. + + Parameters + ---------- + input_path: + Path to the input CSV file. + output_path: + Path where the output CSV will be written. + threshold: + Absolute deviation threshold for the review flag (default 3 %). + + Returns + ------- + pd.DataFrame + The processed DataFrame that was written to *output_path*. 
+ """ + df = load_csv(input_path) + df = validate(df) + df = calculate_deviation(df) + df = add_review_flag(df, threshold=threshold) + df = sort_by_abs_deviation(df) + df.to_csv(output_path, index=False) + return df diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6c13712 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "pricing-model-lab" +version = "0.1.0" +description = "Compare observed vs. model values, compute deviations, and flag outliers." +readme = "README.md" +requires-python = ">=3.9" +license = { text = "MIT" } +dependencies = [ + "pandas>=2.0", +] + +[project.scripts] +pricing-model-lab = "pricing_model_lab.cli:main" + +[tool.setuptools.packages.find] +where = ["."] +include = ["pricing_model_lab*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3ceaf54 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pandas>=2.0 +pytest>=7.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 0000000..414349e --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,202 @@ +"""Tests for pricing_model_lab.processor.""" + +from __future__ import annotations + +import io +import textwrap + +import pandas as pd +import pytest + +from pricing_model_lab.processor import ( + add_review_flag, + calculate_deviation, + load_csv, + process, + sort_by_abs_deviation, + validate, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_df( + item_ids=("A", "B", "C"), + observed=(102.0, 98.0, 110.0), + model=(100.0, 100.0, 100.0), +) -> pd.DataFrame: + return pd.DataFrame( + { + 
"item_id": list(item_ids), + "observed_value": list(observed), + "model_value": list(model), + } + ) + + +# --------------------------------------------------------------------------- +# load_csv +# --------------------------------------------------------------------------- + +def test_load_csv_success(tmp_path): + csv_content = "item_id,observed_value,model_value\nA,100,100\n" + p = tmp_path / "data.csv" + p.write_text(csv_content) + df = load_csv(str(p)) + assert list(df.columns) == ["item_id", "observed_value", "model_value"] + assert len(df) == 1 + + +def test_load_csv_missing_columns(tmp_path): + p = tmp_path / "bad.csv" + p.write_text("item_id,observed_value\nA,100\n") + with pytest.raises(ValueError, match="missing required columns"): + load_csv(str(p)) + + +def test_load_csv_file_not_found(): + with pytest.raises(FileNotFoundError): + load_csv("/nonexistent/path/file.csv") + + +# --------------------------------------------------------------------------- +# validate +# --------------------------------------------------------------------------- + +def test_validate_success(): + df = _make_df() + result = validate(df) + assert result["observed_value"].dtype == float + assert result["model_value"].dtype == float + + +def test_validate_non_numeric_observed(): + df = _make_df(observed=("bad", 98.0, 110.0)) + with pytest.raises(ValueError, match="observed_value"): + validate(df) + + +def test_validate_non_numeric_model(): + df = _make_df(model=(100.0, "n/a", 100.0)) + with pytest.raises(ValueError, match="model_value"): + validate(df) + + +def test_validate_missing_observed(): + df = _make_df(observed=(None, 98.0, 110.0)) + with pytest.raises(ValueError, match="observed_value"): + validate(df) + + +def test_validate_zero_model_value(): + df = _make_df(model=(0.0, 100.0, 100.0)) + with pytest.raises(ValueError, match="zero values"): + validate(df) + + +def test_validate_missing_item_id(): + df = _make_df(item_ids=(None, "B", "C")) + with pytest.raises(ValueError, 
match="item_id"): + validate(df) + + +# --------------------------------------------------------------------------- +# calculate_deviation +# --------------------------------------------------------------------------- + +def test_calculate_deviation_values(): + df = _make_df(observed=(110.0, 90.0, 100.0), model=(100.0, 100.0, 100.0)) + result = calculate_deviation(df) + assert "deviation" in result.columns + assert result.loc[0, "deviation"] == pytest.approx(0.10) + assert result.loc[1, "deviation"] == pytest.approx(-0.10) + assert result.loc[2, "deviation"] == pytest.approx(0.00) + + +def test_calculate_deviation_does_not_mutate(): + df = _make_df() + original_cols = list(df.columns) + calculate_deviation(df) + assert list(df.columns) == original_cols + + +# --------------------------------------------------------------------------- +# add_review_flag +# --------------------------------------------------------------------------- + +def test_add_review_flag_default_threshold(): + df = _make_df(observed=(103.1, 96.9, 100.0), model=(100.0, 100.0, 100.0)) + df = calculate_deviation(df) + result = add_review_flag(df) + # deviations: 0.031, -0.031, 0.0 + assert result.loc[0, "review_flag"] + assert result.loc[1, "review_flag"] + assert not result.loc[2, "review_flag"] + + +def test_add_review_flag_custom_threshold(): + df = _make_df(observed=(105.0, 100.0, 100.0), model=(100.0, 100.0, 100.0)) + df = calculate_deviation(df) + result = add_review_flag(df, threshold=0.10) + # deviation 5% < 10% → not flagged + assert not result.loc[0, "review_flag"] + + +def test_add_review_flag_negative_threshold(): + df = calculate_deviation(_make_df()) + with pytest.raises(ValueError, match="non-negative"): + add_review_flag(df, threshold=-0.01) + + +# --------------------------------------------------------------------------- +# sort_by_abs_deviation +# --------------------------------------------------------------------------- + +def test_sort_by_abs_deviation_order(): + df = 
_make_df(observed=(101.0, 120.0, 95.0), model=(100.0, 100.0, 100.0)) + df = calculate_deviation(df) + result = sort_by_abs_deviation(df) + abs_devs = result["deviation"].abs().tolist() + assert abs_devs == sorted(abs_devs, reverse=True) + + +def test_sort_by_abs_deviation_no_extra_columns(): + df = calculate_deviation(_make_df()) + result = sort_by_abs_deviation(df) + assert "_abs_deviation" not in result.columns + + +# --------------------------------------------------------------------------- +# process (end-to-end) +# --------------------------------------------------------------------------- + +def test_process_end_to_end(tmp_path): + csv_content = textwrap.dedent("""\ + item_id,observed_value,model_value + X,110,100 + Y,100,100 + Z,85,100 + """) + input_path = tmp_path / "input.csv" + input_path.write_text(csv_content) + output_path = tmp_path / "output.csv" + + result = process(str(input_path), str(output_path), threshold=0.03) + + assert output_path.exists() + written = pd.read_csv(str(output_path)) + assert list(written.columns) == [ + "item_id", "observed_value", "model_value", "deviation", "review_flag" + ] + # Z has largest abs deviation (−15%), X second (10%), Y zero + assert written.loc[0, "item_id"] == "Z" + assert written.loc[1, "item_id"] == "X" + assert written.loc[2, "item_id"] == "Y" + # Z and X should be flagged; Y should not + assert written.loc[0, "review_flag"] + assert written.loc[1, "review_flag"] + assert not written.loc[2, "review_flag"] + # row count matches + assert len(result) == 3