diff --git a/week7/community_contributions/IbrahimSheriff/pricer_beat_39.ipynb b/week7/community_contributions/IbrahimSheriff/pricer_beat_39.ipynb new file mode 100644 index 0000000000..210c2b5617 --- /dev/null +++ b/week7/community_contributions/IbrahimSheriff/pricer_beat_39.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Week 7: Beat the 39 — Price prediction (IbrahimSheriff2)\n", + "\n", + "**Goal:** Get average absolute error **below 39.85** (ideally into the lower 30s). \n", + "**Metric:** Same as instructor — average $ error on 250 test samples. \n", + "**HF user:** `sheriff`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q pandas scikit-learn datasets transformers torch peft bitsandbytes trl accelerate matplotlib python-dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import math\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "import torch\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "from dotenv import load_dotenv\n", + "from datasets import load_dataset, Dataset\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n", + "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel\n", + "from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM\n", + "import matplotlib.pyplot as plt\n", + "\n", + "load_dotenv(override=True)\n", + "set_seed(42)\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Config (hyperparameters to tune)\n", + "\n", + "**Instructor baseline:** 39.85. Try varying these to get into the lower 30s. \n", + "**Tip:** Data manipulation (filtering, balancing, prompt format) often gives the biggest gain." 
# %%
# Identity of the run: Hugging Face account, dataset, base model, project tag.
HF_USER = "sheriff"
DATASET_NAME = "ed-donner/pricer-data"
BASE_MODEL = "meta-llama/Llama-3.2-3B"
PROJECT_NAME = "pricer"

# --- Hyperparameters (tune these to beat 39.85) ---
# Optimisation schedule.
NUM_EPOCHS = 2
LEARNING_RATE = 2e-4
PER_DEVICE_TRAIN_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
MAX_SEQ_LENGTH = 256
# LoRA adapter shape.
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# Regularisation / warmup.
WARMUP_RATIO = 0.05
WEIGHT_DECAY = 0.01

# Optional: subsample for quick experiments (set to None to use full train)
TRAIN_SUBSAMPLE = None  # e.g. 20000

EVAL_SIZE = 250  # same as instructor Tester
# Read from the environment (populated by load_dotenv in the import cell).
hf_token = os.environ.get("HF_TOKEN")

# %% [markdown]
# ## Load data

# %%
dataset = load_dataset(DATASET_NAME, token=hf_token)
train_raw, test_raw = dataset["train"], dataset["test"]
print(f"Train: {len(train_raw)}, Test: {len(test_raw)}")
print("Columns:", train_raw.column_names)

# Peek at the first training row so the prompt format is visible before
# any training text is built from it.
if len(train_raw) > 0:
    ex = train_raw[0]
    print("Sample keys:", list(ex.keys()))
    if "text" in ex:
        sample_text = ex["text"] or ""
        print("Sample text (first 300 chars):", sample_text[:300])

# %% [markdown]
# ## Prepare training data (prompt + completion)
#
# **Data manipulation idea:** You can filter by price range, oversample rare buckets, or clean `text` here to teach the model better.
# %%
# Marker that separates the prompt from the answer. It is shared by the
# training text builder, the completion-only collator and extract_price so
# the three can never drift apart.
RESPONSE_TEMPLATE = "Price is $"

# Matches a marker (optionally followed by a number) at the very end of a
# prompt, so it can be removed before the canonical completion is appended.
_TRAILING_ANSWER = re.compile(re.escape(RESPONSE_TEMPLATE) + r"[\d,]*\.?\d*\s*$")


def build_train_text(example):
    """Build one training string: prompt followed by "Price is $<price>".

    Args:
        example: Mapping with a "text" prompt and a "price" value.

    Returns:
        The training text, or None when the price is missing, not a number,
        not finite, or negative (such rows are skipped by the caller).

    If the prompt already ends with the marker (with or without a number
    after it), that tail is removed first so the marker appears exactly once
    at the end and the target always comes from the "price" column.
    """
    prompt = (example.get("text") or "").strip()
    price = example.get("price")
    if price is None:
        return None
    try:
        p = float(price)
    except (TypeError, ValueError):
        return None
    # float("nan") / float("inf") would otherwise be formatted as "nan"/"inf".
    if not math.isfinite(p) or p < 0:
        return None
    prompt = _TRAILING_ANSWER.sub("", prompt)
    # Completion format expected at eval (extract_price looks for the marker).
    return f"{prompt}{RESPONSE_TEMPLATE}{p:.2f}"


train_list = []
skipped_rows = 0
# Iterate the dataset directly instead of indexing row by row.
for row in train_raw:
    text = build_train_text(row)
    if text:
        train_list.append({"text": text})
    else:
        skipped_rows += 1
if skipped_rows:
    print(f"Skipped {skipped_rows} rows without a usable price")

if TRAIN_SUBSAMPLE:
    np.random.seed(42)
    idx = np.random.choice(len(train_list), min(TRAIN_SUBSAMPLE, len(train_list)), replace=False)
    train_list = [train_list[i] for i in idx]

# Keep the un-split dataset under its own name so the split cell below can be
# re-run without shrinking the training set each time.
full_ds = Dataset.from_list(train_list)
print(f"Training samples: {len(full_ds)}")

# %%
# Validation: small subset from train for eval_strategy
val_size = min(500, len(full_ds) // 10)
val_ds = full_ds.select(range(val_size))
train_ds = full_ds.select(range(val_size, len(full_ds)))
print(f"Train: {len(train_ds)}, Val: {len(val_ds)}")

# %% [markdown]
# ## Model & QLoRA

# %%
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

# trust_remote_code is intentionally not enabled: it permits running code
# shipped with the model repo, which a stock Llama tokenizer should not need.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
model.generation_config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# %%
# Completion-only loss: only tokens after "Price is $" are trained
collator = DataCollatorForCompletionOnlyLM(RESPONSE_TEMPLATE, tokenizer=tokenizer)

# %%
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
OUTPUT_DIR = f"{PROJECT_NAME}-{RUN_NAME}"

# load_best_model_at_end needs every evaluated step to have a matching
# checkpoint, so the save interval must be a round multiple of the eval
# interval (the previous 500 / 200 pair is rejected by the argument check).
EVAL_STEPS = 200
SAVE_STEPS = 400
if SAVE_STEPS % EVAL_STEPS != 0:
    raise ValueError(
        f"SAVE_STEPS ({SAVE_STEPS}) must be a multiple of EVAL_STEPS ({EVAL_STEPS}) "
        "when load_best_model_at_end=True"
    )

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    run_name=RUN_NAME,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    optim="paged_adamw_32bit",
    weight_decay=WEIGHT_DECAY,
    bf16=True,
    logging_steps=50,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    args=training_args,
    data_collator=collator,
)
print(f"Output dir: {OUTPUT_DIR}")

# %%
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved to {OUTPUT_DIR}")

# %% [markdown]
# ## Evaluation — beat 39.85
#
# Load the saved adapter (or set `ADAPTER_PATH` to a previous run) and run the same metric as the instructor: **average absolute error** on 250 test samples.

# %%
ADAPTER_PATH = OUTPUT_DIR  # or e.g. "pricer-2025-03-09_12.00.00"

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()


# %%
def extract_price(s):
    """Return the first number that follows "Price is $" in `s`.

    Thousands separators are removed before matching. Returns 0.0 when the
    marker is absent or no number follows it.
    """
    if RESPONSE_TEMPLATE not in s:
        return 0.0
    contents = s.split(RESPONSE_TEMPLATE, 1)[1].replace(",", "")
    m = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(m.group()) if m else 0.0


@torch.no_grad()
def predict(prompt, max_new_tokens=15):
    """Greedy-decode a continuation of `prompt` and parse the price from it.

    Args:
        prompt: Text to complete.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The parsed price as a float (0.0 when none can be parsed).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The decoded text contains the prompt plus the continuation;
    # extract_price reads what follows the first marker.
    full = tokenizer.decode(out[0], skip_special_tokens=True)
    return extract_price(full)


# %%
GREEN, YELLOW, RED, RESET = "\033[92m", "\033[93m", "\033[91m", "\033[0m"
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}


class Tester:
    """Run a predictor over the first `size` rows of `data` and report errors.

    Collects, per datapoint, the guess, the truth, the absolute error, the
    squared log error and a colour bucket, then plots guesses against truths.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        self.title = title or "Model"
        # Never evaluate more rows than the data actually has.
        self.size = min(size, len(data))
        self.guesses, self.truths, self.errors, self.sles, self.colors = [], [], [], [], []

    def color_for(self, error, truth):
        """Bucket an error: green (< $40 or < 20%), orange (< $80 or < 40%), else red."""
        if error < 40 or (truth and error / truth < 0.2):
            return "green"
        if error < 80 or (truth and error / truth < 0.4):
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Predict row `i`, record its metrics and print a coloured summary line."""
        row = self.data[i]
        prompt = (row.get("text") or "").strip()
        # `or 0` also covers an explicit None price, which float() would reject.
        truth = float(row.get("price") or 0)
        guess = self.predictor(prompt)
        error = abs(guess - truth)
        # extract_price can yield a signed decimal; clamp at 0 for the log term
        # so a guess <= -1 cannot raise a math domain error.
        log_err = math.log(max(truth, 0.0) + 1) - math.log(max(guess, 0.0) + 1)
        sle = log_err ** 2
        color = self.color_for(error, truth)
        title = (prompt[:50] + "...") if len(prompt) > 50 else prompt
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:.2f} {title}{RESET}")

    def report(self):
        """Plot guesses vs. truths and return the average absolute error.

        Raises:
            ValueError: if no datapoint has been evaluated yet.
        """
        # Average over what was actually evaluated, not the requested size.
        n = len(self.errors)
        if n == 0:
            raise ValueError("No datapoints evaluated; nothing to report")
        average_error = sum(self.errors) / n
        rmsle = math.sqrt(sum(self.sles) / n)
        hits = sum(1 for c in self.colors if c == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:.2f} Hits={hits/n*100:.1f}%"
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color="deepskyblue", lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel("Ground Truth")
        plt.ylabel("Model Estimate")
        plt.title(title)
        plt.show()
        return average_error

    def run(self):
        """Evaluate every datapoint in order, then report."""
        for i in range(self.size):
            self.run_datapoint(i)
        return self.report()

    @classmethod
    def test(cls, predictor, data, title=None, size=250):
        """Convenience wrapper: build a Tester and run it."""
        t = cls(predictor, data, title=title, size=size)
        return t.run()


# %%
# Guard against a test split smaller than EVAL_SIZE (select() would fail on
# out-of-range indices).
test_data = test_raw.select(range(min(EVAL_SIZE, len(test_raw))))
avg_error = Tester.test(predict, test_data)
print(f"\n>>> Average error: ${avg_error:,.2f} (instructor baseline: 39.85; goal: < 39, lower 30s)")