LLMSQL · DzmitryPihulski · Mar 21, 2026
@@ -36,15 +36,13 @@ Evaluate from a list of Python dicts:
     report = evaluate(predictions)
     print(report)
 
-Providing your own DB and questions (skip workdir):
+Using a persistent cache directory for benchmark downloads:
 
 .. code-block:: python
 
     report = evaluate(
         "path_to_outputs.jsonl",
-        questions_path="bench/questions.jsonl",
-        db_path="bench/sqlite_tables.db",
-        workdir_path=None
+        workdir_path="./benchmark-cache",
     )
 
 Function Arguments
@@ -59,11 +57,7 @@ Function Arguments
    * - outputs
      - Path to JSONL file or a list of prediction dicts (required).
    * - workdir_path
-     - Directory for automatic benchmark downloads. Ignored if both questions_path and db_path are provided. Default: "llmsql_workdir".
-   * - questions_path
-     - Optional path to benchmark questions JSONL file.
-   * - db_path
-     - Optional path to SQLite DB with evaluation tables.
+     - Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically.
    * - save_report
      - Path to save detailed JSON report. Defaults to "evaluation_results_{uuid}.json".
    * - show_mismatches

@@ -36,8 +36,6 @@ Example: Running your first evaluation (with transformers backend)
     results = inference_transformers(
         model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
         output_file="outputs/preds_transformers.jsonl",
-        questions_path="data/questions.jsonl",
-        tables_path="data/tables.jsonl",
         num_fewshots=5,
         batch_size=8,
         max_new_tokens=256,

@@ -27,8 +27,7 @@ Using transformers backend.
     results = inference_transformers(
         model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
         output_file="outputs/preds_transformers.jsonl",
-        questions_path="data/questions.jsonl",
-        tables_path="data/tables.jsonl",
+        workdir_path="./benchmark-cache",
         num_fewshots=5,
         batch_size=8,
         max_new_tokens=256,
@@ -58,8 +57,7 @@ Using vllm backend.
     results = inference_vllm(
         model_name="Qwen/Qwen2.5-1.5B-Instruct",
         output_file="outputs/preds_vllm.jsonl",
-        questions_path="data/questions.jsonl",
-        tables_path="data/tables.jsonl",
+        workdir_path="./benchmark-cache",
         num_fewshots=5,
         batch_size=8,
         max_new_tokens=256,

@@ -2,30 +2,26 @@
 from typing import Any
 
 from llmsql._cli.subparsers import SubCommand
-from llmsql.config.config import DEFAULT_LLMSQL_VERSION, DEFAULT_WORKDIR_PATH
+from llmsql.config.config import DEFAULT_LLMSQL_VERSION
 from llmsql.evaluation.evaluate import evaluate
 
 
 class Evaluate(SubCommand):
     """Command for LLM evaluation"""
 
     def __init__(
-            self,
-            subparsers:argparse._SubParsersAction,
-            *args:Any,
-            **kwargs:Any
-    )->None:
+        self, subparsers: argparse._SubParsersAction, *args: Any, **kwargs: Any
+    ) -> None:
         self._parser = subparsers.add_parser(
             "evaluate",
-            help = "Evaluate predictions against the LLMSQL benchmark",
+            help="Evaluate predictions against the LLMSQL benchmark",
             formatter_class=argparse.RawDescriptionHelpFormatter,
         )
 
         self._add_args()
-        self._parser.set_defaults(func = self._execute)
+        self._parser.set_defaults(func=self._execute)
 
-
-    def _add_args(self)->None:
+    def _add_args(self) -> None:
         """Add evaluation-specific arguments to the parser."""
         self._parser.add_argument(
             "--outputs",
@@ -36,30 +32,15 @@ def _add_args(self)->None:
 
         self._parser.add_argument(
             "--version",
-            type = str,
-            default = DEFAULT_LLMSQL_VERSION,
-            choices=["1.0","2.0"],
-            help = f"LLMSQL benchmark version (default:{DEFAULT_LLMSQL_VERSION})"
+            type=str,
+            default=DEFAULT_LLMSQL_VERSION,
+            choices=["1.0", "2.0"],
+            help=f"LLMSQL benchmark version (default:{DEFAULT_LLMSQL_VERSION})",
         )
 
         self._parser.add_argument(
             "--workdir-path",
-            default = DEFAULT_WORKDIR_PATH,
-            help = f"Directory for benchmark files (default: {DEFAULT_WORKDIR_PATH})",
-        )
-
-        self._parser.add_argument(
-            "--questions-path",
-            type = str,
-            default = None,
-            help = "Manual path to benchmark questions JSON file.",
-        )
-
-        self._parser.add_argument(
-            "--db-path",
-            type=str,
-            default = None,
-            help = "Path to SQLite benchmark database.",
+            help="Directory for benchmark downloads. If omitted, a temporary directory is used.",
         )
 
         self._parser.add_argument(
@@ -82,21 +63,18 @@ def _add_args(self)->None:
             default=None,
             help="Path to save evaluation report JSON.",
         )
-        
+
     @staticmethod
     def _execute(args: argparse.Namespace) -> None:
         """Execute the evaluate function with parsed arguments."""
         try:
             evaluate(
                 outputs=args.outputs,
-                version = args.version,
+                version=args.version,
                 workdir_path=args.workdir_path,
-                questions_path=args.questions_path,
-                db_path=args.db_path,
                 save_report=args.save_report,
                 show_mismatches=args.show_mismatches,
                 max_mismatches=args.max_mismatches,
             )
         except Exception as e:
             print(f"Error during evaluation: {e}")
-
@@ -56,9 +56,11 @@ def _add_args(self) -> None:
         def add_common_benchmark_args(parser: argparse.ArgumentParser) -> None:
             parser.add_argument("--version", default="2.0", choices=["1.0", "2.0"])
             parser.add_argument("--output-file", default="llm_sql_predictions.jsonl")
-            parser.add_argument("--questions-path")
-            parser.add_argument("--tables-path")
-            parser.add_argument("--workdir-path", default="./workdir")
+            parser.add_argument(
+                "--workdir-path",
+                default=None,
+                help="Directory for benchmark downloads. If omitted, a temporary directory is used.",
+            )
             parser.add_argument("--num-fewshots", type=int, default=5)
             parser.add_argument("--batch-size", type=int, default=8)
             parser.add_argument("--seed", type=int, default=42)
@@ -217,8 +219,6 @@ def _execute_transformers(args: argparse.Namespace) -> None:
             generation_kwargs=args.generation_kwargs,
             version=args.version,
             output_file=args.output_file,
-            questions_path=args.questions_path,
-            tables_path=args.tables_path,
             workdir_path=args.workdir_path,
             num_fewshots=args.num_fewshots,
             batch_size=args.batch_size,
@@ -243,8 +243,6 @@ def _execute_vllm(args: argparse.Namespace) -> None:
             sampling_kwargs=args.sampling_kwargs,
             version=args.version,
             output_file=args.output_file,
-            questions_path=args.questions_path,
-            tables_path=args.tables_path,
             workdir_path=args.workdir_path,
             limit=args.limit,
             num_fewshots=args.num_fewshots,
@@ -267,8 +265,6 @@ def _execute_api(args: argparse.Namespace) -> None:
             request_headers=args.request_headers,
             version=args.version,
             output_file=args.output_file,
-            questions_path=args.questions_path,
-            tables_path=args.tables_path,
             workdir_path=args.workdir_path,
             limit=args.limit,
             num_fewshots=args.num_fewshots,

@@ -6,7 +6,6 @@
 }
 
 DEFAULT_LLMSQL_VERSION: Literal["1.0", "2.0"] = "2.0"
-DEFAULT_WORKDIR_PATH = "llmsql_workdir"
 
 
 def get_repo_id(version: str = DEFAULT_LLMSQL_VERSION) -> str:

@@ -43,9 +43,7 @@ print(report)
 evaluate(
     outputs,
     *,
-    workdir_path: str | None = "llmsql_workdir",
-    questions_path: str | None = None,
-    db_path: str | None = None,
+    workdir_path: str | None = None,
     save_report: str | None = None,
     show_mismatches: bool = True,
     max_mismatches: int = 5,
@@ -55,9 +53,7 @@ evaluate(
 | Argument          | Description                                                                                                                                     |
 | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
 | `outputs`         | **Required**. Either a path to a JSONL file or a list of dicts with predictions.                                                                |
-| `workdir_path`    | Directory for automatic download of benchmark files (ignored if both `questions_path` and `db_path` are provided). Default: `"llmsql_workdir"`. |
-| `questions_path`  | Optional path to benchmark questions JSONL file.                                                                                                |
-| `db_path`         | Optional path to SQLite DB with evaluation tables.                                                                                              |
+| `workdir_path`    | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically. |
 | `save_report`     | Optional path to save detailed JSON report. Defaults to `evaluation_results_{uuid}.json`.                                                       |
 | `show_mismatches` | Print mismatches while evaluating. Default: `True`.                                                                                             |
 | `max_mismatches`  | Maximum number of mismatches to print. Default: `5`.                                                                                            |

@@ -9,21 +9,19 @@
 """
 
 from datetime import datetime, timezone
-from pathlib import Path
 import uuid
 
 from rich.progress import track
 
 from llmsql.config.config import (
     DEFAULT_LLMSQL_VERSION,
-    DEFAULT_WORKDIR_PATH,
     get_repo_id,
 )
 from llmsql.utils.evaluation_utils import (
     connect_sqlite,
-    download_benchmark_file,
     evaluate_sample,
 )
+from llmsql.utils.inference_utils import _maybe_download, resolve_workdir_path
 from llmsql.utils.rich_utils import log_mismatch, print_summary
 from llmsql.utils.utils import load_jsonl, load_jsonl_dict_by_key, save_json_report
 
@@ -32,9 +30,7 @@ def evaluate(
     outputs: str | list[dict[int, str | int]],
     *,
     version: str = DEFAULT_LLMSQL_VERSION,
-    workdir_path: str | None = DEFAULT_WORKDIR_PATH,
-    questions_path: str | None = None,
-    db_path: str | None = None,
+    workdir_path: str | None = None,
     save_report: str | None = None,
     show_mismatches: bool = True,
     max_mismatches: int = 5,
@@ -45,9 +41,8 @@ def evaluate(
     Args:
         version: LLMSQL version
         outputs: Either a JSONL file path or a list of dicts.
-        workdir_path: Directory for auto-downloads (ignored if all paths provided).
-        questions_path: Manual path to benchmark questions JSONL.
-        db_path: Manual path to SQLite benchmark DB.
+        workdir_path: Directory to store downloaded benchmark files. If omitted, a
+            temporary directory is created automatically.
         save_report: Optional manual save path. If None → auto-generated.
         show_mismatches: Print mismatches while evaluating.
         max_mismatches: Max mismatches to print.
@@ -58,39 +53,12 @@ def evaluate(
 
     # Determine input type
     input_mode = "jsonl_path" if isinstance(outputs, str) else "dict_list"
+    workdir = resolve_workdir_path(workdir_path)
 
     repo_id = get_repo_id(version)
 
-    # --- Resolve inputs if needed ---
-    workdir = Path(workdir_path) if workdir_path else None
-    if workdir_path is not None and (questions_path is None or db_path is None):
-        workdir.mkdir(parents=True, exist_ok=True)  # type: ignore
-
-    if questions_path is None:
-        if workdir is None:
-            raise ValueError(
-                "questions_path not provided, and workdir_path disabled. "
-                "Enable workdir or provide questions_path explicitly."
-            )
-        local_q = workdir / "questions.jsonl"
-        questions_path = (
-            str(local_q)
-            if local_q.is_file()
-            else download_benchmark_file(repo_id, "questions.jsonl", workdir)
-        )
-
-    if db_path is None:
-        if workdir is None:
-            raise ValueError(
-                "db_path not provided, and workdir_path disabled. "
-                "Enable workdir or provide db_path explicitly."
-            )
-        local_db = workdir / "sqlite_tables.db"
-        db_path = (
-            str(local_db)
-            if local_db.is_file()
-            else download_benchmark_file(repo_id, "sqlite_tables.db", workdir)
-        )
+    questions_path = _maybe_download(repo_id, "questions.jsonl", workdir)
+    db_path = _maybe_download(repo_id, "sqlite_tables.db", workdir)
 
     # --- Load benchmark questions ---
     questions = load_jsonl_dict_by_key(questions_path, key="question_id")

@@ -36,8 +36,7 @@ from llmsql import inference_transformers
 results = inference_transformers(
         model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
         output_file="outputs/preds_transformers.jsonl",
-        questions_path="data/questions.jsonl",
-        tables_path="data/tables.jsonl",
+        workdir_path="./benchmark-cache",
         num_fewshots=5,
         batch_size=8,
         max_new_tokens=256,
@@ -168,9 +167,7 @@ Runs inference using the Hugging Face `transformers` backend.
 | Argument                        | Type    | Default                   | Description                                      |
 | ------------------------------- | ------- | ------------------------- | ------------------------------------------------ |
 | `output_file`                   | `str`   | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL.          |
-| `questions_path`                | `str \| None` | `None`          | Path to questions.jsonl (auto-downloads if missing). |
-| `tables_path`                   | `str \| None` | `None`          | Path to tables.jsonl (auto-downloads if missing).    |
-| `workdir_path`                  | `str`   | `"llmsql_workdir"`        | Working directory for downloaded files.          |
+| `workdir_path`                  | `str \| None`   | `None`        | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically.   |
 | `num_fewshots`                  | `int`   | `5`                       | Number of few-shot examples (0, 1, or 5).        |
 | `batch_size`                    | `int`   | `8`                       | Batch size for inference.                        |
 | `seed`                          | `int`   | `42`                      | Random seed for reproducibility.                 |
@@ -210,9 +207,7 @@ Runs inference using the [vLLM](https://github.com/vllm-project/vllm) backend fo
 | Argument                        | Type           | Default                       | Description                                      |
 | ------------------------------- | -------------- | ----------------------------- | ------------------------------------------------ |
 | `output_file`                   | `str`          | `"outputs/predictions.jsonl"` | Path to write predictions as JSONL.              |
-| `questions_path`                | `str \| None`  | `None`                        | Path to questions.jsonl (auto-downloads if missing). |
-| `tables_path`                   | `str \| None`  | `None`                        | Path to tables.jsonl (auto-downloads if missing).    |
-| `workdir_path`                  | `str`          | `"llmsql_workdir"`            | Working directory for downloaded files.          |
+| `workdir_path`                  | `str \| None`          | `None`            | Directory used to cache downloaded benchmark files. If omitted, a temporary directory is created automatically.          |
 | `num_fewshots`                  | `int`          | `5`                           | Number of few-shot examples (0, 1, or 5).        |
 | `batch_size`                    | `int`          | `8`                           | Number of prompts per batch.                     |
 | `seed`                          | `int`          | `42`                          | Random seed for reproducibility.                 |

@@ -6,7 +6,6 @@
 }
 
 DEFAULT_LLMSQL_VERSION: Literal["1.0", "2.0"] = "2.0"
-DEFAULT_WORKDIR_PATH = "llmsql_workdir"
 
 
 def get_repo_id(version: str = DEFAULT_LLMSQL_VERSION) -> str: