Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions docs/docs/evaluation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,13 @@ Evaluate from a list of Python dicts:
report = evaluate(predictions)
print(report)

Providing your own DB and questions (skip workdir):
Using a persistent cache directory for benchmark downloads:

.. code-block:: python

report = evaluate(
"path_to_outputs.jsonl",
questions_path="bench/questions.jsonl",
db_path="bench/sqlite_tables.db",
workdir_path=None
workdir_path="./benchmark-cache",
)

Function Arguments
Expand All @@ -59,11 +57,7 @@ Function Arguments
* - outputs
- Path to JSONL file or a list of prediction dicts (required).
* - workdir_path
- Directory for automatic benchmark downloads. Ignored if both questions_path and db_path are provided. Default: "llmsql_workdir".
* - questions_path
- Optional path to benchmark questions JSONL file.
* - db_path
- Optional path to SQLite DB with evaluation tables.
- Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically.
* - save_report
- Path to save detailed JSON report. Defaults to "evaluation_results_{uuid}.json".
* - show_mismatches
Expand Down
2 changes: 0 additions & 2 deletions docs/docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ Example: Running your first evaluation (with transformers backend)
results = inference_transformers(
model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
output_file="outputs/preds_transformers.jsonl",
questions_path="data/questions.jsonl",
tables_path="data/tables.jsonl",
num_fewshots=5,
batch_size=8,
max_new_tokens=256,
Expand Down
6 changes: 2 additions & 4 deletions docs/docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ Using transformers backend.
results = inference_transformers(
model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
output_file="outputs/preds_transformers.jsonl",
questions_path="data/questions.jsonl",
tables_path="data/tables.jsonl",
workdir_path="./benchmark-cache",
num_fewshots=5,
batch_size=8,
max_new_tokens=256,
Expand Down Expand Up @@ -58,8 +57,7 @@ Using vllm backend.
results = inference_vllm(
model_name="Qwen/Qwen2.5-1.5B-Instruct",
output_file="outputs/preds_vllm.jsonl",
questions_path="data/questions.jsonl",
tables_path="data/tables.jsonl",
workdir_path="./benchmark-cache",
num_fewshots=5,
batch_size=8,
max_new_tokens=256,
Expand Down
48 changes: 13 additions & 35 deletions llmsql/_cli/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,26 @@
from typing import Any

from llmsql._cli.subparsers import SubCommand
from llmsql.config.config import DEFAULT_LLMSQL_VERSION, DEFAULT_WORKDIR_PATH
from llmsql.config.config import DEFAULT_LLMSQL_VERSION
from llmsql.evaluation.evaluate import evaluate


class Evaluate(SubCommand):
"""Command for LLM evaluation"""

def __init__(
self,
subparsers:argparse._SubParsersAction,
*args:Any,
**kwargs:Any
)->None:
self, subparsers: argparse._SubParsersAction, *args: Any, **kwargs: Any
) -> None:
self._parser = subparsers.add_parser(
"evaluate",
help = "Evaluate predictions against the LLMSQL benchmark",
help="Evaluate predictions against the LLMSQL benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
)

self._add_args()
self._parser.set_defaults(func = self._execute)
self._parser.set_defaults(func=self._execute)


def _add_args(self)->None:
def _add_args(self) -> None:
"""Add evaluation-specific arguments to the parser."""
self._parser.add_argument(
"--outputs",
Expand All @@ -36,30 +32,15 @@ def _add_args(self)->None:

self._parser.add_argument(
"--version",
type = str,
default = DEFAULT_LLMSQL_VERSION,
choices=["1.0","2.0"],
help = f"LLMSQL benchmark version (default:{DEFAULT_LLMSQL_VERSION})"
type=str,
default=DEFAULT_LLMSQL_VERSION,
choices=["1.0", "2.0"],
help=f"LLMSQL benchmark version (default:{DEFAULT_LLMSQL_VERSION})",
)

self._parser.add_argument(
"--workdir-path",
default = DEFAULT_WORKDIR_PATH,
help = f"Directory for benchmark files (default: {DEFAULT_WORKDIR_PATH})",
)

self._parser.add_argument(
"--questions-path",
type = str,
default = None,
help = "Manual path to benchmark questions JSON file.",
)

self._parser.add_argument(
"--db-path",
type=str,
default = None,
help = "Path to SQLite benchmark database.",
help="Directory for benchmark downloads. If omitted, a temporary directory is used.",
)

self._parser.add_argument(
Expand All @@ -82,21 +63,18 @@ def _add_args(self)->None:
default=None,
help="Path to save evaluation report JSON.",
)

@staticmethod
def _execute(args: argparse.Namespace) -> None:
"""Execute the evaluate function with parsed arguments."""
try:
evaluate(
outputs=args.outputs,
version = args.version,
version=args.version,
workdir_path=args.workdir_path,
questions_path=args.questions_path,
db_path=args.db_path,
save_report=args.save_report,
show_mismatches=args.show_mismatches,
max_mismatches=args.max_mismatches,
)
except Exception as e:
print(f"Error during evaluation: {e}")

14 changes: 5 additions & 9 deletions llmsql/_cli/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ def _add_args(self) -> None:
def add_common_benchmark_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument("--version", default="2.0", choices=["1.0", "2.0"])
parser.add_argument("--output-file", default="llm_sql_predictions.jsonl")
parser.add_argument("--questions-path")
parser.add_argument("--tables-path")
parser.add_argument("--workdir-path", default="./workdir")
parser.add_argument(
"--workdir-path",
default=None,
help="Directory for benchmark downloads. If omitted, a temporary directory is used.",
)
parser.add_argument("--num-fewshots", type=int, default=5)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--seed", type=int, default=42)
Expand Down Expand Up @@ -217,8 +219,6 @@ def _execute_transformers(args: argparse.Namespace) -> None:
generation_kwargs=args.generation_kwargs,
version=args.version,
output_file=args.output_file,
questions_path=args.questions_path,
tables_path=args.tables_path,
workdir_path=args.workdir_path,
num_fewshots=args.num_fewshots,
batch_size=args.batch_size,
Expand All @@ -243,8 +243,6 @@ def _execute_vllm(args: argparse.Namespace) -> None:
sampling_kwargs=args.sampling_kwargs,
version=args.version,
output_file=args.output_file,
questions_path=args.questions_path,
tables_path=args.tables_path,
workdir_path=args.workdir_path,
limit=args.limit,
num_fewshots=args.num_fewshots,
Expand All @@ -267,8 +265,6 @@ def _execute_api(args: argparse.Namespace) -> None:
request_headers=args.request_headers,
version=args.version,
output_file=args.output_file,
questions_path=args.questions_path,
tables_path=args.tables_path,
workdir_path=args.workdir_path,
limit=args.limit,
num_fewshots=args.num_fewshots,
Expand Down
1 change: 0 additions & 1 deletion llmsql/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
}

DEFAULT_LLMSQL_VERSION: Literal["1.0", "2.0"] = "2.0"
DEFAULT_WORKDIR_PATH = "llmsql_workdir"


def get_repo_id(version: str = DEFAULT_LLMSQL_VERSION) -> str:
Expand Down
8 changes: 2 additions & 6 deletions llmsql/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ print(report)
evaluate(
outputs,
*,
workdir_path: str | None = "llmsql_workdir",
questions_path: str | None = None,
db_path: str | None = None,
workdir_path: str | None = None,
save_report: str | None = None,
show_mismatches: bool = True,
max_mismatches: int = 5,
Expand All @@ -55,9 +53,7 @@ evaluate(
| Argument | Description |
| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `outputs` | **Required**. Either a path to a JSONL file or a list of dicts with predictions. |
| `workdir_path` | Directory for automatic download of benchmark files (ignored if both `questions_path` and `db_path` are provided). Default: `"llmsql_workdir"`. |
| `questions_path` | Optional path to benchmark questions JSONL file. |
| `db_path` | Optional path to SQLite DB with evaluation tables. |
| `workdir_path` | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically. |
| `save_report` | Optional path to save detailed JSON report. Defaults to `evaluation_results_{uuid}.json`. |
| `show_mismatches` | Print mismatches while evaluating. Default: `True`. |
| `max_mismatches` | Maximum number of mismatches to print. Default: `5`. |
Expand Down
46 changes: 7 additions & 39 deletions llmsql/evaluation/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,19 @@
"""

from datetime import datetime, timezone
from pathlib import Path
import uuid

from rich.progress import track

from llmsql.config.config import (
DEFAULT_LLMSQL_VERSION,
DEFAULT_WORKDIR_PATH,
get_repo_id,
)
from llmsql.utils.evaluation_utils import (
connect_sqlite,
download_benchmark_file,
evaluate_sample,
)
from llmsql.utils.inference_utils import _maybe_download, resolve_workdir_path
from llmsql.utils.rich_utils import log_mismatch, print_summary
from llmsql.utils.utils import load_jsonl, load_jsonl_dict_by_key, save_json_report

Expand All @@ -32,9 +30,7 @@ def evaluate(
outputs: str | list[dict[int, str | int]],
*,
version: str = DEFAULT_LLMSQL_VERSION,
workdir_path: str | None = DEFAULT_WORKDIR_PATH,
questions_path: str | None = None,
db_path: str | None = None,
workdir_path: str | None = None,
save_report: str | None = None,
show_mismatches: bool = True,
max_mismatches: int = 5,
Expand All @@ -45,9 +41,8 @@ def evaluate(
Args:
version: LLMSQL version
outputs: Either a JSONL file path or a list of dicts.
workdir_path: Directory for auto-downloads (ignored if all paths provided).
questions_path: Manual path to benchmark questions JSONL.
db_path: Manual path to SQLite benchmark DB.
workdir_path: Directory to store downloaded benchmark files. If omitted, a
temporary directory is created automatically.
save_report: Optional manual save path. If None → auto-generated.
show_mismatches: Print mismatches while evaluating.
max_mismatches: Max mismatches to print.
Expand All @@ -58,39 +53,12 @@ def evaluate(

# Determine input type
input_mode = "jsonl_path" if isinstance(outputs, str) else "dict_list"
workdir = resolve_workdir_path(workdir_path)

repo_id = get_repo_id(version)

# --- Resolve inputs if needed ---
workdir = Path(workdir_path) if workdir_path else None
if workdir_path is not None and (questions_path is None or db_path is None):
workdir.mkdir(parents=True, exist_ok=True) # type: ignore

if questions_path is None:
if workdir is None:
raise ValueError(
"questions_path not provided, and workdir_path disabled. "
"Enable workdir or provide questions_path explicitly."
)
local_q = workdir / "questions.jsonl"
questions_path = (
str(local_q)
if local_q.is_file()
else download_benchmark_file(repo_id, "questions.jsonl", workdir)
)

if db_path is None:
if workdir is None:
raise ValueError(
"db_path not provided, and workdir_path disabled. "
"Enable workdir or provide db_path explicitly."
)
local_db = workdir / "sqlite_tables.db"
db_path = (
str(local_db)
if local_db.is_file()
else download_benchmark_file(repo_id, "sqlite_tables.db", workdir)
)
questions_path = _maybe_download(repo_id, "questions.jsonl", workdir)
db_path = _maybe_download(repo_id, "sqlite_tables.db", workdir)

# --- Load benchmark questions ---
questions = load_jsonl_dict_by_key(questions_path, key="question_id")
Expand Down
11 changes: 3 additions & 8 deletions llmsql/inference/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ from llmsql import inference_transformers
results = inference_transformers(
model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
output_file="outputs/preds_transformers.jsonl",
questions_path="data/questions.jsonl",
tables_path="data/tables.jsonl",
workdir_path="./benchmark-cache",
num_fewshots=5,
batch_size=8,
max_new_tokens=256,
Expand Down Expand Up @@ -168,9 +167,7 @@ Runs inference using the Hugging Face `transformers` backend.
| Argument | Type | Default | Description |
| ------------------------------- | ------- | ------------------------- | ------------------------------------------------ |
| `output_file` | `str` | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL. |
| `questions_path` | `str \| None` | `None` | Path to questions.jsonl (auto-downloads if missing). |
| `tables_path` | `str \| None` | `None` | Path to tables.jsonl (auto-downloads if missing). |
| `workdir_path` | `str` | `"llmsql_workdir"` | Working directory for downloaded files. |
| `workdir_path` | `str \| None` | `None` | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically. |
| `num_fewshots` | `int` | `5` | Number of few-shot examples (0, 1, or 5). |
| `batch_size` | `int` | `8` | Batch size for inference. |
| `seed` | `int` | `42` | Random seed for reproducibility. |
Expand Down Expand Up @@ -210,9 +207,7 @@ Runs inference using the [vLLM](https://github.com/vllm-project/vllm) backend fo
| Argument | Type | Default | Description |
| ------------------------------- | -------------- | ----------------------------- | ------------------------------------------------ |
| `output_file` | `str` | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL. |
| `questions_path` | `str \| None` | `None` | Path to questions.jsonl (auto-downloads if missing). |
| `tables_path` | `str \| None` | `None` | Path to tables.jsonl (auto-downloads if missing). |
| `workdir_path` | `str` | `"llmsql_workdir"` | Working directory for downloaded files. |
| `workdir_path` | `str \| None` | `None` | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically. |
| `num_fewshots` | `int` | `5` | Number of few-shot examples (0, 1, or 5). |
| `batch_size` | `int` | `8` | Number of prompts per batch. |
| `seed` | `int` | `42` | Random seed for reproducibility. |
Expand Down
Loading
Loading