diff --git a/.gitignore b/.gitignore index a3c3066..b74d3e5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,11 @@ llmsql_workdir evaluation_* coverage.xml + +.idea + +# Sphinx build +docs/_build/ +docs/.doctrees/ +*.doctree +*.pickle diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo index f89f358..d7142fb 100644 --- a/docs/_build/html/.buildinfo +++ b/docs/_build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file records the configuration used when building these files. When it is not found, a full rebuild will be done. -config: f80c99a6f2b64faef91db5c44c3abb2f +config: 3caef0746bc07fabd8f91030ce7b6533 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_build/html/.doctrees/docs/evaluation.doctree b/docs/_build/html/.doctrees/docs/evaluation.doctree index c582b67..3dff8be 100644 Binary files a/docs/_build/html/.doctrees/docs/evaluation.doctree and b/docs/_build/html/.doctrees/docs/evaluation.doctree differ diff --git a/docs/_build/html/.doctrees/docs/index.doctree b/docs/_build/html/.doctrees/docs/index.doctree deleted file mode 100644 index 18a85b9..0000000 Binary files a/docs/_build/html/.doctrees/docs/index.doctree and /dev/null differ diff --git a/docs/_build/html/.doctrees/docs/inference.doctree b/docs/_build/html/.doctrees/docs/inference.doctree index c2b53f8..efab5a5 100644 Binary files a/docs/_build/html/.doctrees/docs/inference.doctree and b/docs/_build/html/.doctrees/docs/inference.doctree differ diff --git a/docs/_build/html/.doctrees/docs/usage.doctree b/docs/_build/html/.doctrees/docs/usage.doctree index f239d10..1e56e2c 100644 Binary files a/docs/_build/html/.doctrees/docs/usage.doctree and b/docs/_build/html/.doctrees/docs/usage.doctree differ diff --git a/docs/_build/html/.doctrees/environment.pickle b/docs/_build/html/.doctrees/environment.pickle deleted file mode 100644 index 2bd04f5..0000000 Binary files a/docs/_build/html/.doctrees/environment.pickle and /dev/null differ diff --git 
a/docs/_build/html/.doctrees/index.doctree b/docs/_build/html/.doctrees/index.doctree index 90c54d1..a9d0e11 100644 Binary files a/docs/_build/html/.doctrees/index.doctree and b/docs/_build/html/.doctrees/index.doctree differ diff --git a/docs/_build/html/_sources/docs/index.rst.txt b/docs/_build/html/_sources/docs/index.rst.txt index ff5bd05..b2760cd 100644 --- a/docs/_build/html/_sources/docs/index.rst.txt +++ b/docs/_build/html/_sources/docs/index.rst.txt @@ -1,7 +1,12 @@ LLMSQL package Documentation ============================ -`← Back to main page <../index.html>`__ +.. raw:: html + + + ← Back to main page + + Welcome to the LLMSQL documentation! This guide covers everything you need to use the project, from running inference diff --git a/docs/_build/html/_sources/docs/usage.rst.txt b/docs/_build/html/_sources/docs/usage.rst.txt index 4468249..806a4e8 100644 --- a/docs/_build/html/_sources/docs/usage.rst.txt +++ b/docs/_build/html/_sources/docs/usage.rst.txt @@ -33,11 +33,11 @@ Using transformers backend. 
batch_size=8, max_new_tokens=256, temperature=0.7, - model_args={ + model_kwargs={ "attn_implementation": "flash_attention_2", "torch_dtype": "bfloat16", }, - generate_kwargs={ + generation_kwargs={ "do_sample": False, }, ) diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css index 93a4776..4738b2e 100644 --- a/docs/_build/html/_static/basic.css +++ b/docs/_build/html/_static/basic.css @@ -741,14 +741,6 @@ abbr, acronym { cursor: help; } -.translated { - background-color: rgba(207, 255, 207, 0.2) -} - -.untranslated { - background-color: rgba(255, 207, 207, 0.2) -} - /* -- code displays --------------------------------------------------------- */ pre { @@ -911,4 +903,4 @@ div.math:hover a.headerlink { #top-link { display: none; } -} +} \ No newline at end of file diff --git a/docs/_build/html/_static/documentation_options.js b/docs/_build/html/_static/documentation_options.js index 35c998a..eede5b1 100644 --- a/docs/_build/html/_static/documentation_options.js +++ b/docs/_build/html/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.1.13', + VERSION: '0.1.15', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', @@ -10,4 +10,4 @@ const DOCUMENTATION_OPTIONS = { NAVIGATION_WITH_KEYS: false, SHOW_SEARCH_SUMMARY: true, ENABLE_SEARCH_SHORTCUTS: true, -}; \ No newline at end of file +}; diff --git a/docs/_build/html/_static/leaderboard.json b/docs/_build/html/_static/leaderboard.json new file mode 100644 index 0000000..c842948 --- /dev/null +++ b/docs/_build/html/_static/leaderboard.json @@ -0,0 +1,106 @@ +[ + { + "model": "openai/gpt-oss-120b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.9049, + "date": "2026-02-24" + }, + { + "model": "openai/gpt-oss-20b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8871, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "type": "open-source", + "fewshots": 
5, + "backend": "vllm", + "accuracy": 0.8607, + "date": "2026-02-24" + }, + { + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8519, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-7B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.794, + "date": "2026-02-24" + }, + { + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.7599, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.6401, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5415, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5224, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen3-0.6B", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4983, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4044, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.3727, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.2678, + "date": "2026-02-24" + } +] \ No newline at end of file diff --git a/docs/_build/html/_static/scripts/front_page.js b/docs/_build/html/_static/scripts/front_page.js index 03fd423..1e0c0dc 100644 --- a/docs/_build/html/_static/scripts/front_page.js +++ b/docs/_build/html/_static/scripts/front_page.js @@ -48,3 +48,88 @@ if (searchInput) 
{ } }); } + +document.addEventListener("DOMContentLoaded", async () => { + const container = document.getElementById('leaderboard-container'); + if (!container) return; + + try { + const response = await fetch('_static/leaderboard.json'); + const rows = await response.json(); + renderLeaderboard(rows); + } catch (e) { + container.innerHTML = '

Error loading leaderboard 😢

'; + console.error(e); + } +}); + +function renderLeaderboard(rows) { + const container = document.getElementById('leaderboard-container'); + container.innerHTML = ''; + + const table = document.createElement('table'); + table.className = 'leaderboard-table'; + + const thead = document.createElement('thead'); + thead.innerHTML = ` + + Rank + Model + Type + Fewshots + Backend + Accuracy + Date + `; + table.appendChild(thead); + + const tbody = document.createElement('tbody'); + rows.forEach((row, i) => { + const tr = document.createElement('tr'); + + // Берём только вторую часть после слеша + const modelName = row.model.includes('/') ? row.model.split('/')[1] : row.model; + + // Модель с ссылкой + const modelCell = document.createElement('td'); + if (row.url) { + const a = document.createElement('a'); + a.href = row.url; + a.target = "_blank"; + a.rel = "noopener"; + a.textContent = modelName; // <-- здесь только вторая часть + modelCell.appendChild(a); + } else { + modelCell.textContent = modelName; + } + + // Accuracy + const accuracyCell = document.createElement('td'); + const barContainer = document.createElement('div'); + barContainer.className = 'accuracy-bar'; + const fill = document.createElement('div'); + fill.className = 'fill'; + fill.style.width = `${(row.accuracy*100).toFixed(2)}%`; + const text = document.createElement('span'); + text.textContent = `${(row.accuracy*100).toFixed(2)}%`; + barContainer.appendChild(fill); + barContainer.appendChild(text); + accuracyCell.appendChild(barContainer); + + // Вставка остальных ячеек + tr.innerHTML += `${i+1}`; + tr.appendChild(modelCell); + tr.innerHTML += ` + ${row.type} + ${row.fewshots} + ${row.backend} + `; + tr.appendChild(accuracyCell); + tr.innerHTML += `${row.date}`; + + tbody.appendChild(tr); + }); + + table.appendChild(tbody); + container.appendChild(table); +} \ No newline at end of file diff --git a/docs/_build/html/_static/searchtools.js b/docs/_build/html/_static/searchtools.js index 
2c774d1..91f4be5 100644 --- a/docs/_build/html/_static/searchtools.js +++ b/docs/_build/html/_static/searchtools.js @@ -513,9 +513,11 @@ const Search = { // perform the search on the required terms searchTerms.forEach((word) => { const files = []; + // find documents, if any, containing the query word in their text/title term indices + // use Object.hasOwnProperty to avoid mismatching against prototype properties const arr = [ - { files: terms[word], score: Scorer.term }, - { files: titleTerms[word], score: Scorer.title }, + { files: terms.hasOwnProperty(word) ? terms[word] : undefined, score: Scorer.term }, + { files: titleTerms.hasOwnProperty(word) ? titleTerms[word] : undefined, score: Scorer.title }, ]; // add support for partial matches if (word.length > 2) { @@ -547,8 +549,9 @@ const Search = { // set score for the word in each file recordFiles.forEach((file) => { - if (!scoreMap.has(file)) scoreMap.set(file, {}); - scoreMap.get(file)[word] = record.score; + if (!scoreMap.has(file)) scoreMap.set(file, new Map()); + const fileScores = scoreMap.get(file); + fileScores.set(word, record.score); }); }); @@ -587,7 +590,7 @@ const Search = { break; // select one (max) score for the file. 
- const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + const score = Math.max(...wordList.map((w) => scoreMap.get(file).get(w))); // add result to the result list results.push([ docNames[file], diff --git a/docs/_build/html/_static/styles/front_page.css b/docs/_build/html/_static/styles/front_page.css index 55bce1c..1d3bcfb 100644 --- a/docs/_build/html/_static/styles/front_page.css +++ b/docs/_build/html/_static/styles/front_page.css @@ -248,3 +248,70 @@ pre span { background: none !important; color: inherit !important; } + +.leaderboard-box { + padding: 1rem; + background: #fff; + border-radius: 12px; + box-shadow: 0 6px 20px rgba(0,0,0,0.08); + overflow-x: auto; +} + +.leaderboard-table { + width: 100%; + border-collapse: collapse; + font-family: 'Inter', 'Roboto', sans-serif; + font-size: 0.95rem; + text-align: center; +} + +.leaderboard-table th { + background: linear-gradient(180deg, #f6f6f6 0%, #e9e9e9 100%); + color: #111827; + font-weight: 600; + font-size: 0.95rem; + padding: 14px 10px; + text-transform: uppercase; + letter-spacing: 0.5px; + border-bottom: 2px solid #ddd; + text-align: center; + box-shadow: inset 0 -1px 0 rgba(0,0,0,0.05); +} + +.leaderboard-table td { + padding: 10px; + border-bottom: 1px solid #e0e0e0; +} + +.leaderboard-table tbody tr:nth-child(even) { + background-color: #f9f9f9; +} + +/* Accuracy bar */ +.accuracy-bar { + position: relative; + width: 100%; + height: 20px; + background: #e0e0e0; + border-radius: 10px; + overflow: hidden; +} + +.accuracy-bar .fill { + height: 100%; + background: linear-gradient(90deg,#4caf50,#81c784); + border-radius: 10px 0 0 10px; +} + +.accuracy-bar span { + position: absolute; + width: 100%; + text-align: center; + top: 0; + left: 0; + font-size: 0.8rem; + font-weight: 600; + line-height: 20px; + color: #000; +} + diff --git a/docs/_build/html/docs/evaluation.html b/docs/_build/html/docs/evaluation.html index e7d2eec..22d13e4 100644 --- a/docs/_build/html/docs/evaluation.html 
+++ b/docs/_build/html/docs/evaluation.html @@ -5,21 +5,23 @@ - Evaluation API Reference — LLMSQL 0.1.13 documentation + Evaluation API Reference — LLMSQL 0.1.15 documentation - + + - - + + - + + +
+

Evaluation API Reference

The evaluate() function allows you to benchmark Text-to-SQL model outputs @@ -153,37 +155,6 @@

Report Saving -

LLMSQL Evaluation Module

-

Provides the evaluate() function to benchmark Text-to-SQL model outputs -on the LLMSQL benchmark.

-

See the documentation for full usage details.

-

-
-
-llmsql.evaluation.evaluate.evaluate(outputs, *, workdir_path: str | None = 'llmsql_workdir', questions_path: str | None = None, db_path: str | None = None, save_report: str | None = None, show_mismatches: bool = True, max_mismatches: int = 5) dict[source]
-

Evaluate predicted SQL queries against the LLMSQL benchmark.

-
-
Parameters:
-
    -
  • outputs – Either a JSONL file path or a list of dicts.

  • -
  • workdir_path – Directory for auto-downloads (ignored if all paths provided).

  • -
  • questions_path – Manual path to benchmark questions JSONL.

  • -
  • db_path – Manual path to SQLite benchmark DB.

  • -
  • save_report – Optional manual save path. If None → auto-generated.

  • -
  • show_mismatches – Print mismatches while evaluating.

  • -
  • max_mismatches – Max mismatches to print.

  • -
-
-
Returns:
-

Metrics and mismatches.

-
-
Return type:
-

dict

-
-
-
-

💬 Made with ❤️ by the LLMSQL Team @@ -206,11 +177,8 @@

Table of Contents

  • Function Arguments
  • Input Format
  • Output Metrics
  • -
  • Report Saving -
  • +
  • Report Saving
  • +
  • Report Saving
  • @@ -248,15 +216,13 @@

    Navigation

  • index
  • -
  • - modules |
  • previous |
  • - + +
    +

    LLMSQL package Documentation

    -

    ← Back to main page

    -

    Welcome to the LLMSQL documentation! + + ← Back to main page +

    Welcome to the LLMSQL documentation! This guide covers everything you need to use the project, from running inference to evaluating Text-to-SQL models.

    @@ -161,16 +163,13 @@

    Navigation

  • index
  • -
  • - modules |
  • next |
  • previous |
  • - +
    diff --git a/docs/_build/html/docs/inference.html b/docs/_build/html/docs/inference.html index caad396..2aa340f 100644 --- a/docs/_build/html/docs/inference.html +++ b/docs/_build/html/docs/inference.html @@ -5,22 +5,24 @@ - Inference API Reference — LLMSQL 0.1.13 documentation + Inference API Reference — LLMSQL 0.1.15 documentation - + + - - + + - + + +
    -
    -

    Inference API Reference

    -
    -

    LLMSQL Transformers Inference Function

    -

    This module provides a single function inference_transformers() that performs -text-to-SQL generation using large language models via the Transformers backend.

    -

    Example

    -
    from llmsql.inference import inference_transformers
    -
    -results = inference_transformers(
    -    model_or_model_name_or_path="Qwen/Qwen2.5-1.5B-Instruct",
    -    output_file="outputs/preds_transformers.jsonl",
    -    questions_path="data/questions.jsonl",
    -    tables_path="data/tables.jsonl",
    -    num_fewshots=5,
    -    batch_size=8,
    -    max_new_tokens=256,
    -    temperature=0.7,
    -    model_args={
    -        "torch_dtype": "bfloat16",
    -    },
    -    generate_kwargs={
    -        "do_sample": False,
    -    },
    -)
    -
    -
    -

    Notes

    -

    This function uses the HuggingFace Transformers backend and may produce -slightly different outputs than the vLLM backend even with the same inputs -due to differences in implementation and numerical precision.

    -
    -
    -
    -llmsql.inference.inference_transformers.inference_transformers(model_or_model_name_or_path: str | AutoModelForCausalLM, tokenizer_or_name: str | Any | None = None, *, trust_remote_code: bool = True, dtype: dtype = torch.float16, device_map: str | dict[str, int] | None = 'auto', hf_token: str | None = None, model_kwargs: dict[str, Any] | None = None, tokenizer_kwargs: dict[str, Any] | None = None, chat_template: str | None = None, max_new_tokens: int = 256, temperature: float = 0.0, do_sample: bool = False, top_p: float = 1.0, top_k: int = 50, generation_kwargs: dict[str, Any] | None = None, output_file: str = 'llm_sql_predictions.jsonl', questions_path: str | None = None, tables_path: str | None = None, workdir_path: str = 'llmsql_workdir', num_fewshots: int = 5, batch_size: int = 8, seed: int = 42) list[dict[str, str]][source]
    -

    Inference a causal model (Transformers) on the LLMSQL benchmark.

    -
    -
    Parameters:
    -
      -
    • model_or_model_name_or_path – Model object or HF model name/path.

    • -
    • tokenizer_or_name – Tokenizer object or HF tokenizer name/path.

    • -
    • Loading (# Tokenizer)

    • -
    • trust_remote_code – Whether to trust remote code (default: True).

    • -
    • dtype – Torch dtype for model (default: float16).

    • -
    • device_map – Device placement strategy (default: “auto”).

    • -
    • hf_token – Hugging Face authentication token.

    • -
    • model_kwargs – Additional arguments for AutoModelForCausalLM.from_pretrained(). -Note: ‘dtype’, ‘device_map’, ‘trust_remote_code’, ‘token’ -are handled separately and will override values here.

    • -
    • Loading

    • -
    • tokenizer_kwargs – Additional arguments for AutoTokenizer.from_pretrained(). ‘padding_side’ defaults to “left”. -Note: ‘trust_remote_code’, ‘token’ are handled separately and will override values here.

    • -
    • Chat (# Prompt &)

    • -
    • chat_template – Optional chat template to apply before tokenization.

    • -
    • Generation (#)

    • -
    • max_new_tokens – Maximum tokens to generate per sequence.

    • -
    • temperature – Sampling temperature (0.0 = greedy).

    • -
    • do_sample – Whether to use sampling vs greedy decoding.

    • -
    • top_p – Nucleus sampling parameter.

    • -
    • top_k – Top-k sampling parameter.

    • -
    • generation_kwargs – Additional arguments for model.generate(). -Note: ‘max_new_tokens’, ‘temperature’, ‘do_sample’, -‘top_p’, ‘top_k’ are handled separately.

    • -
    • Benchmark (#)

    • -
    • output_file – Output JSONL file path for completions.

    • -
    • questions_path – Path to benchmark questions JSONL.

    • -
    • tables_path – Path to benchmark tables JSONL.

    • -
    • workdir_path – Working directory path.

    • -
    • num_fewshots – Number of few-shot examples (0, 1, or 5).

    • -
    • batch_size – Batch size for inference.

    • -
    • seed – Random seed for reproducibility.

    • -
    -
    -
    Returns:
    -

    List of generated SQL results with metadata.

    -
    -
    -
    +
    +

    Inference API Reference

    +
    +

    Inference API Reference

    -
    -

    LLMSQL vLLM Inference Function

    -

    This module provides a single function inference_vllm() that performs -text-to-SQL generation using large language models via the vLLM backend.

    -

    Example

    -
    from llmsql.inference import inference_vllm
    -
    -results = inference_vllm(
    -    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    -    output_file="outputs/predictions.jsonl",
    -    questions_path="data/questions.jsonl",
    -    tables_path="data/tables.jsonl",
    -    num_fewshots=5,
    -    batch_size=8,
    -    max_new_tokens=256,
    -    temperature=0.7,
    -    tensor_parallel_size=1,
    -)
    -
    -
    -

    Notes

    -

    This function uses the vLLM backend. Outputs may differ from the Transformers -backend due to differences in implementation, batching, and numerical precision.

    -
    -
    -
    -llmsql.inference.inference_vllm.inference_vllm(model_name: str, *, trust_remote_code: bool = True, tensor_parallel_size: int = 1, hf_token: str | None = None, llm_kwargs: dict[str, Any] | None = None, use_chat_template: bool = True, max_new_tokens: int = 256, temperature: float = 1.0, do_sample: bool = True, sampling_kwargs: dict[str, Any] | None = None, output_file: str = 'llm_sql_predictions.jsonl', questions_path: str | None = None, tables_path: str | None = None, workdir_path: str = 'llmsql_workdir', num_fewshots: int = 5, batch_size: int = 8, seed: int = 42) list[dict[str, str]][source]
    -

    Run SQL generation using vLLM.

    -
    -
    Parameters:
    -
      -
    • model_name – Hugging Face model name or path.

    • -
    • Loading (# Model)

    • -
    • trust_remote_code – Whether to trust remote code (default: True).

    • -
    • tensor_parallel_size – Number of GPUs for tensor parallelism (default: 1).

    • -
    • hf_token – Hugging Face authentication token.

    • -
    • llm_kwargs – Additional arguments for vllm.LLM(). -Note: ‘model’, ‘tokenizer’, ‘tensor_parallel_size’, -‘trust_remote_code’ are handled separately and will -override values here.

    • -
    • Generation (#)

    • -
    • max_new_tokens – Maximum tokens to generate per sequence.

    • -
    • temperature – Sampling temperature (0.0 = greedy).

    • -
    • do_sample – Whether to use sampling vs greedy decoding.

    • -
    • sampling_kwargs – Additional arguments for vllm.SamplingParams(). -Note: ‘temperature’, ‘max_tokens’ are handled -separately and will override values here.

    • -
    • Benchmark (#)

    • -
    • output_file – Path to write outputs (will be overwritten).

    • -
    • questions_path – Path to questions.jsonl (auto-downloads if missing).

    • -
    • tables_path – Path to tables.jsonl (auto-downloads if missing).

    • -
    • workdir_path – Directory to store downloaded data.

    • -
    • num_fewshots – Number of few-shot examples (0, 1, or 5).

    • -
    • batch_size – Number of questions per generation batch.

    • -
    • seed – Random seed for reproducibility.

    • -
    -
    -
    Returns:
    -

    List of dicts containing question_id and generated completion.

    -
    -
    -
    -

    @@ -39,81 +39,13 @@

    Navigation

    +

    Index

    - E - | I - | L - | M - -
    -

    E

    - - -
    - -

    I

    - - - -
    - -

    L

    - - - -
      -
    • - llmsql.evaluation.evaluate - -
    • -
    • - llmsql.inference.inference_transformers - -
    • -
      -
    • - llmsql.inference.inference_vllm - -
    • -
    - -

    M

    - - -
    +
    @@ -142,10 +74,7 @@

    Navigation

  • index
  • -
  • - modules |
  • - +
    diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html index c3b94b6..bd4199d 100644 --- a/docs/_build/html/index.html +++ b/docs/_build/html/index.html @@ -10,7 +10,7 @@ LLMSQL Project — Text-to-SQL Benchmark - + @@ -126,13 +126,20 @@

    2️⃣ Inference from CLI

    --output-file outputs/preds.jsonl \ --batch-size 8 \ --temperature 0.9 \ ---generate-kwargs '{"do_sample": false, "top_p": 0.95}' +--generation-kwargs '{"do_sample": false, "top_p": 0.95}'

    3️⃣ Evaluation API (Python)

    -
    from llmsql import LLMSQLEvaluator
    +
    from llmsql import evaluate
    +
+report = evaluate(outputs="path_to_your_outputs.jsonl")
    +print(report)
    +
    +

    Or with the results from the inference: +

    +
    from llmsql import evaluate
    +
    +# results = inference_transformers(...) or inference_vllm(...)
     
    -evaluator = LLMSQLEvaluator()
    -report = evaluator.evaluate(outputs_path="path_to_your_outputs.jsonl")
    +report = evaluate(outputs=results)
     print(report)
     
    @@ -146,15 +153,9 @@ -

    📊 Leaderboard [in progress]

    -
    -

    - The official Leaderboard is currently empty and in progress. - - Submit - - your model results to be the first on the ranking! -

    +

    📊 Leaderboard — Execution Accuracy (EX)

    +
    +

    Loading leaderboard...

    @@ -162,7 +163,7 @@

    📄 Citation

    @inproceedings{llmsql_bench,
       title={LLMSQL: Upgrading WikiSQL for the LLM Era of Text-to-SQL},
       author={Pihulski, Dzmitry and Charchut, Karol and Novogrodskaia, Viktoria and Koco{'n}, Jan},
    -  booktitle={2025 IEEE ICDMW},
  booktitle={2025 IEEE ICDMW},
       year={2025},
       organization={IEEE}
     }
    @@ -202,4 +203,4 @@ 

    📄 Citation

    - + \ No newline at end of file diff --git a/docs/_build/html/leaderboard.json b/docs/_build/html/leaderboard.json new file mode 100644 index 0000000..c842948 --- /dev/null +++ b/docs/_build/html/leaderboard.json @@ -0,0 +1,106 @@ +[ + { + "model": "openai/gpt-oss-120b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.9049, + "date": "2026-02-24" + }, + { + "model": "openai/gpt-oss-20b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8871, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8607, + "date": "2026-02-24" + }, + { + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8519, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-7B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.794, + "date": "2026-02-24" + }, + { + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.7599, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.6401, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5415, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5224, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen3-0.6B", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4983, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4044, + "date": "2026-02-24" + }, + { + 
"model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.3727, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.2678, + "date": "2026-02-24" + } +] \ No newline at end of file diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv index 201680c..ef93dc2 100644 Binary files a/docs/_build/html/objects.inv and b/docs/_build/html/objects.inv differ diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html index 212e3b0..1a05821 100644 --- a/docs/_build/html/search.html +++ b/docs/_build/html/search.html @@ -4,18 +4,19 @@ - Search — LLMSQL 0.1.13 documentation + Search — LLMSQL 0.1.15 documentation - + + - + - + - + @@ -25,6 +26,7 @@ + +
    +

    Search

    +
    @@ -89,10 +98,7 @@

    Navigation

  • index
  • -
  • - modules |
  • - +
    diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js index 970540c..f5962dd 100644 --- a/docs/_build/html/searchindex.js +++ b/docs/_build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Basic Example": [[3, "basic-example"]], "Contents": [[1, null]], "Evaluation API Reference": [[0, null]], "Example: Running your first evaluation (with transformers backend)": [[1, "example-running-your-first-evaluation-with-transformers-backend"]], "Features": [[0, "features"]], "Full Documentation": [[1, "full-documentation"]], "Function Arguments": [[0, "function-arguments"]], "Getting Started": [[1, "getting-started"]], "Inference API Reference": [[2, null]], "Input Format": [[0, "input-format"]], "Installation": [[1, "installation"]], "LLMSQL Evaluation Module": [[0, "llmsql-evaluation-module"]], "LLMSQL Transformers Inference Function": [[2, "llmsql-transformers-inference-function"]], "LLMSQL package Documentation": [[1, null]], "LLMSQL vLLM Inference Function": [[2, "llmsql-vllm-inference-function"]], "Output Metrics": [[0, "output-metrics"]], "Report Saving": [[0, "report-saving"]], "Typical workflow": [[3, "typical-workflow"]], "Usage Examples": [[0, "usage-examples"]], "Usage Overview": [[3, null]]}, "docnames": ["docs/evaluation", "docs/index", "docs/inference", "docs/usage", "index"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1}, "filenames": ["docs/evaluation.rst", "docs/index.rst", "docs/inference.rst", "docs/usage.rst", "index.rst"], "indexentries": {"evaluate() (in module llmsql.evaluation.evaluate)": [[0, "llmsql.evaluation.evaluate.evaluate", false]], "llmsql.evaluation.evaluate": [[0, "module-llmsql.evaluation.evaluate", false]], "module": 
[[0, "module-llmsql.evaluation.evaluate", false]]}, "objects": {"llmsql.evaluation": [[0, 0, 0, "-", "evaluate"]], "llmsql.evaluation.evaluate": [[0, 1, 1, "", "evaluate"]], "llmsql.inference": [[2, 0, 0, "-", "inference_transformers"], [2, 0, 0, "-", "inference_vllm"]], "llmsql.inference.inference_transformers": [[2, 1, 1, "", "inference_transformers"]], "llmsql.inference.inference_vllm": [[2, 1, 1, "", "inference_vllm"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:function"}, "terms": {"0": [1, 2, 3], "1": [0, 1, 2, 3], "2": 0, "256": [1, 2, 3], "3": 0, "30": 0, "4096": 3, "42": 2, "5": [0, 1, 2, 3], "50": 2, "5b": [1, 2, 3], "7": [1, 2, 3], "8": [1, 2, 3], "9": 3, "By": 0, "If": 0, "It": 0, "The": 0, "accuraci": [0, 3], "activ": 0, "addit": 2, "ag": 0, "against": 0, "all": 0, "allow": 0, "ani": 2, "api": 1, "appli": 2, "ar": [0, 2], "argument": 2, "attn_implement": 3, "authent": 2, "auto": [0, 2], "automat": 0, "automodelforcausallm": 2, "autotoken": 2, "back": 1, "backend": [2, 3], "batch": 2, "batch_siz": [1, 2, 3], "befor": 2, "bench": 0, "benchmark": [0, 2], "bfloat16": [1, 2, 3], "bool": [0, 2], "both": 0, "can": 0, "causal": 2, "chat": 2, "chat_templ": 2, "code": 2, "complet": 2, "compon": 3, "comput": 3, "configur": 0, "contain": [0, 2], "count": 0, "cover": 1, "current": 0, "data": [1, 2, 3], "databas": 0, "dataset": 3, "db": 0, "db_path": 0, "decod": 2, "default": [0, 2], "descript": 0, "detail": 0, "devic": 2, "device_map": 2, "dict": [0, 2], "dict_list": 0, "dictionari": 0, "differ": 2, "directori": [0, 2], "displai": 0, "do_sampl": [1, 2, 3], "document": 0, "download": [0, 2], "dtype": 2, "due": 2, "either": 0, "error": 0, "evalu": 3, "evaluation_results_": 0, "even": 2, "everyth": 1, "exact": 0, "exampl": 2, "execut": 0, "face": 2, "fals": [1, 2, 3], "few": 2, "file": [0, 2], "flash_attention_2": 3, "float": 2, "float16": 2, "follow": 0, "from": [0, 1, 2, 
3], "from_pretrain": 2, "full": 0, "gener": [0, 2, 3], "generate_kwarg": [1, 2, 3], "generation_kwarg": 2, "gold": 0, "gold_non": 0, "gpu": 2, "gpu_memory_util": 3, "greedi": 2, "guid": 1, "handl": 2, "here": 2, "hf": 2, "hf_token": 2, "how": 0, "hug": 2, "huggingfac": 2, "i": 0, "ignor": 0, "implement": 2, "import": [0, 1, 2, 3], "infer": [1, 3], "inference_transform": [1, 2, 3], "inference_vllm": [2, 3], "input": 2, "input_mod": 0, "inspect": 3, "instruct": [1, 2, 3], "int": [0, 2], "invalid": 0, "json": 0, "jsonl": [0, 1, 2, 3], "jsonl_path": 0, "k": 2, "kei": 0, "languag": 2, "larg": 2, "left": 2, "level": 3, "list": [0, 2], "llm": [2, 3], "llm_kwarg": [2, 3], "llm_sql_predict": 2, "llmsql": 3, "llmsql_workdir": [0, 2], "llmsqlevalu": 3, "load": 2, "log": 0, "made": [0, 1, 2, 3], "mai": 2, "main": 1, "manual": 0, "match": 0, "max": 0, "max_mismatch": 0, "max_model_len": 3, "max_new_token": [1, 2, 3], "max_token": 2, "maximum": [0, 2], "metadata": 2, "metric": 3, "mismatch": 0, "miss": [0, 2], "mode": 0, "model": [0, 1, 2, 3], "model_arg": [1, 2, 3], "model_kwarg": 2, "model_nam": [2, 3], "model_or_model_name_or_path": [1, 2, 3], "modul": 2, "name": [0, 2], "need": 1, "none": [0, 2], "note": 2, "nucleu": 2, "null": 0, "num_fewshot": [1, 2, 3], "number": [0, 2], "numer": 2, "object": 2, "option": [0, 2], "output": [1, 2, 3], "output_fil": [1, 2, 3], "outputs_path": 3, "overal": 0, "overrid": [0, 2], "overview": 1, "overwritten": 2, "own": 0, "packag": 3, "padding_sid": 2, "page": 1, "parallel": 2, "paramet": [0, 2], "pass": 3, "path": [0, 2], "path_to_output": 0, "per": 2, "perform": [2, 3], "pip": 1, "placement": 2, "precis": 2, "pred_non": 0, "predict": [0, 2, 3], "predicted_sql": 0, "preds_transform": [1, 2, 3], "preds_vllm": 3, "primari": 3, "print": [0, 1, 3], "produc": 2, "project": 1, "prompt": 2, "provid": [0, 2, 3], "python": 0, "queri": [0, 3], "question": [0, 1, 2, 3], "question_id": [0, 2], "questions_path": [0, 1, 2, 3], "qwen": [1, 2, 3], "qwen2": 
[1, 2, 3], "random": 2, "refer": 1, "remot": 2, "report": 3, "reproduc": 2, "requir": 0, "result": [0, 1, 2, 3], "return": [0, 2], "run": [2, 3], "same": 2, "sampl": 2, "sampling_kwarg": 2, "samplingparam": 2, "save_report": 0, "see": 0, "seed": 2, "select": 0, "separ": 2, "sequenc": 2, "shot": 2, "should": 0, "show_mismatch": 0, "singl": 2, "size": 2, "skip": 0, "slightli": 2, "some": 3, "sourc": [0, 1, 2], "sql": [0, 1, 2, 3], "sql_error": 0, "sqlite": 0, "sqlite_t": 0, "store": 2, "str": [0, 2], "strategi": 2, "summari": 0, "support": 0, "tabl": [0, 1, 2, 3], "tables_path": [1, 2, 3], "take": 3, "task": 3, "team": [0, 1, 2, 3], "temperatur": [1, 2, 3], "templat": 2, "tensor": 2, "tensor_parallel_s": [2, 3], "text": [0, 1, 2], "than": 2, "thi": [0, 1, 2], "time": 3, "timestamp": 0, "token": 2, "tokenizer_kwarg": 2, "tokenizer_or_nam": 2, "top": 2, "top_k": 2, "top_p": 2, "torch": 2, "torch_dtyp": [1, 2, 3], "total": 0, "transform": 3, "true": [0, 2], "trust": 2, "trust_remote_cod": 2, "two": 3, "type": 0, "us": [0, 1, 2, 3], "usag": 1, "use_chat_templ": 2, "uuid": 0, "v": 2, "valu": 2, "via": 2, "vllm": 3, "wa": 0, "welcom": 1, "were": 0, "where": 0, "whether": 2, "while": 0, "work": 2, "workdir": 0, "workdir_path": [0, 2], "write": 2, "you": [0, 1], "your": 0}, "titles": ["Evaluation API Reference", "LLMSQL package Documentation", "Inference API Reference", "Usage Overview", "<no title>"], "titleterms": {"api": [0, 2], "argument": 0, "backend": 1, "basic": 3, "content": 1, "document": 1, "evalu": [0, 1], "exampl": [0, 1, 3], "featur": 0, "first": 1, "format": 0, "full": 1, "function": [0, 2], "get": 1, "infer": 2, "input": 0, "instal": 1, "llmsql": [0, 1, 2], "metric": 0, "modul": 0, "output": 0, "overview": 3, "packag": 1, "refer": [0, 2], "report": 0, "run": 1, "save": 0, "start": 1, "transform": [1, 2], "typic": 3, "usag": [0, 3], "vllm": 2, "workflow": 3, "your": 1}}) +Search.setIndex({"alltitles":{"Basic 
Example":[[3,"basic-example"]],"Contents":[[1,null]],"Evaluation API Reference":[[0,null]],"Example: Running your first evaluation (with transformers backend)":[[1,"example-running-your-first-evaluation-with-transformers-backend"]],"Features":[[0,"features"]],"Full Documentation":[[1,"full-documentation"]],"Function Arguments":[[0,"function-arguments"]],"Getting Started":[[1,"getting-started"]],"Inference API Reference":[[2,null]],"Input Format":[[0,"input-format"]],"Installation":[[1,"installation"]],"LLMSQL package Documentation":[[1,null]],"Output Metrics":[[0,"output-metrics"]],"Report Saving":[[0,"report-saving"]],"Typical workflow":[[3,"typical-workflow"]],"Usage Examples":[[0,"usage-examples"]],"Usage Overview":[[3,null]]},"docnames":["docs/evaluation","docs/index","docs/inference","docs/usage","index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.viewcode":1},"filenames":["docs\\evaluation.rst","docs\\index.rst","docs\\inference.rst","docs\\usage.rst","index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"0":[1,3],"1":[0,1,3],"2":0,"256":[1,3],"3":0,"30":0,"4096":3,"5":[0,1,3],"5b":[1,3],"7":[1,3],"8":[1,3],"9":3,"By":0,"It":0,"The":0,"accuraci":[0,3],"activ":0,"ag":0,"against":0,"allow":0,"api":1,"ar":0,"attn_implement":3,"automat":0,"back":1,"backend":3,"batch_siz":[1,3],"bench":0,"benchmark":0,"bfloat16":[1,3],"both":0,"can":0,"compon":3,"comput":3,"configur":0,"contain":0,"count":0,"cover":1,"current":0,"data":[1,3],"databas":0,"dataset":3,"db":0,"db_path":0,"default":0,"descript":0,"detail":0,"dict":0,"dict_list":0,"dictionari":0,"directori":0,"displai":0,"do_sampl":[1,3],"download":0,"error":0,"evalu":3,"evaluation_results_":0,"everyth":1,"exact":0,"execut":0,"fals":[1,3],"file":0,"fla
sh_attention_2":3,"follow":0,"from":[0,1,3],"gener":3,"generate_kwarg":1,"generation_kwarg":3,"gold":0,"gold_non":0,"gpu_memory_util":3,"guid":1,"how":0,"i":0,"ignor":0,"import":[0,1,3],"infer":[1,3],"inference_transform":[1,3],"inference_vllm":3,"input_mod":0,"inspect":3,"instruct":[1,3],"invalid":0,"json":0,"jsonl":[0,1,3],"jsonl_path":0,"kei":0,"level":3,"list":0,"llm":3,"llm_kwarg":3,"llmsql":[0,2,3],"llmsql_workdir":0,"llmsqlevalu":3,"log":0,"made":[0,1,2,3],"main":1,"match":0,"max_mismatch":0,"max_model_len":3,"max_new_token":[1,3],"maximum":0,"metric":3,"mismatch":0,"miss":0,"mode":0,"model":[0,1,3],"model_arg":1,"model_kwarg":3,"model_nam":3,"model_or_model_name_or_path":[1,3],"name":0,"need":1,"none":0,"null":0,"num_fewshot":[1,3],"number":0,"option":0,"output":[1,3],"output_fil":[1,3],"outputs_path":3,"overal":0,"overrid":0,"overview":1,"own":0,"packag":3,"page":1,"pass":3,"path":0,"path_to_output":0,"perform":3,"pip":1,"pred_non":0,"predict":[0,3],"predicted_sql":0,"preds_transform":[1,3],"preds_vllm":3,"primari":3,"print":[0,1,3],"project":1,"provid":[0,3],"python":0,"queri":[0,3],"question":[0,1,3],"question_id":0,"questions_path":[0,1,3],"qwen":[1,3],"qwen2":[1,3],"refer":1,"report":3,"requir":0,"result":[0,1,3],"return":0,"run":3,"save_report":0,"select":0,"should":0,"show_mismatch":0,"skip":0,"some":3,"sourc":1,"sql":[0,1,3],"sql_error":0,"sqlite":0,"sqlite_t":0,"summari":0,"support":0,"tabl":[0,1,3],"tables_path":[1,3],"take":3,"task":3,"team":[0,1,2,3],"temperatur":[1,3],"tensor_parallel_s":3,"text":[0,1],"thi":[0,1],"time":3,"timestamp":0,"torch_dtyp":[1,3],"total":0,"transform":3,"true":0,"two":3,"us":[0,1,3],"usag":1,"uuid":0,"vllm":3,"wa":0,"welcom":1,"were":0,"where":0,"while":0,"workdir":0,"workdir_path":0,"you":[0,1],"your":0},"titles":["Evaluation API Reference","LLMSQL package Documentation","Inference API Reference","Usage Overview","<no 
title>"],"titleterms":{"api":[0,2],"argument":0,"backend":1,"basic":3,"content":1,"document":1,"evalu":[0,1],"exampl":[0,1,3],"featur":0,"first":1,"format":0,"full":1,"function":0,"get":1,"infer":2,"input":0,"instal":1,"llmsql":1,"metric":0,"output":0,"overview":3,"packag":1,"refer":[0,2],"report":0,"run":1,"save":0,"start":1,"transform":1,"typic":3,"usag":[0,3],"workflow":3,"your":1}}) \ No newline at end of file diff --git a/docs/_static/leaderboard.json b/docs/_static/leaderboard.json new file mode 100644 index 0000000..c842948 --- /dev/null +++ b/docs/_static/leaderboard.json @@ -0,0 +1,106 @@ +[ + { + "model": "openai/gpt-oss-120b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.9049, + "date": "2026-02-24" + }, + { + "model": "openai/gpt-oss-20b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8871, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8607, + "date": "2026-02-24" + }, + { + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8519, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-7B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.794, + "date": "2026-02-24" + }, + { + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.7599, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.6401, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5415, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", 
+ "accuracy": 0.5224, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen3-0.6B", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4983, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4044, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.3727, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.2678, + "date": "2026-02-24" + } +] \ No newline at end of file diff --git a/docs/_static/scripts/front_page.js b/docs/_static/scripts/front_page.js index 03fd423..4f2cdc5 100644 --- a/docs/_static/scripts/front_page.js +++ b/docs/_static/scripts/front_page.js @@ -48,3 +48,86 @@ if (searchInput) { } }); } + +document.addEventListener("DOMContentLoaded", async () => { + const container = document.getElementById('leaderboard-container'); + if (!container) return; + + try { + const response = await fetch('_static/leaderboard.json'); + const rows = await response.json(); + renderLeaderboard(rows); + } catch (e) { + container.innerHTML = '

    Error loading leaderboard 😢

    '; + console.error(e); + } +}); + +function renderLeaderboard(rows) { + const container = document.getElementById('leaderboard-container'); + container.innerHTML = ''; + + const table = document.createElement('table'); + table.className = 'leaderboard-table'; + + const thead = document.createElement('thead'); + thead.innerHTML = ` + + Rank + Model + Type + Fewshots + Backend + Accuracy + Date + `; + table.appendChild(thead); + + const tbody = document.createElement('tbody'); + rows.forEach((row, i) => { + const tr = document.createElement('tr'); + + const modelName = row.model.includes('/') ? row.model.split('/')[1] : row.model; + + const modelCell = document.createElement('td'); + if (row.url) { + const a = document.createElement('a'); + a.href = row.url; + a.target = "_blank"; + a.rel = "noopener"; + a.textContent = modelName; // <-- здесь только вторая часть + modelCell.appendChild(a); + } else { + modelCell.textContent = modelName; + } + + // Accuracy + const accuracyCell = document.createElement('td'); + const barContainer = document.createElement('div'); + barContainer.className = 'accuracy-bar'; + const fill = document.createElement('div'); + fill.className = 'fill'; + fill.style.width = `${(row.accuracy*100).toFixed(2)}%`; + const text = document.createElement('span'); + text.textContent = `${(row.accuracy*100).toFixed(2)}%`; + barContainer.appendChild(fill); + barContainer.appendChild(text); + accuracyCell.appendChild(barContainer); + + + tr.innerHTML += `${i+1}`; + tr.appendChild(modelCell); + tr.innerHTML += ` + ${row.type} + ${row.fewshots} + ${row.backend} + `; + tr.appendChild(accuracyCell); + tr.innerHTML += `${row.date}`; + + tbody.appendChild(tr); + }); + + table.appendChild(tbody); + container.appendChild(table); +} \ No newline at end of file diff --git a/docs/_static/styles/front_page.css b/docs/_static/styles/front_page.css index 55bce1c..1d3bcfb 100644 --- a/docs/_static/styles/front_page.css +++ b/docs/_static/styles/front_page.css @@ 
-248,3 +248,70 @@ pre span { background: none !important; color: inherit !important; } + +.leaderboard-box { + padding: 1rem; + background: #fff; + border-radius: 12px; + box-shadow: 0 6px 20px rgba(0,0,0,0.08); + overflow-x: auto; +} + +.leaderboard-table { + width: 100%; + border-collapse: collapse; + font-family: 'Inter', 'Roboto', sans-serif; + font-size: 0.95rem; + text-align: center; +} + +.leaderboard-table th { + background: linear-gradient(180deg, #f6f6f6 0%, #e9e9e9 100%); + color: #111827; + font-weight: 600; + font-size: 0.95rem; + padding: 14px 10px; + text-transform: uppercase; + letter-spacing: 0.5px; + border-bottom: 2px solid #ddd; + text-align: center; + box-shadow: inset 0 -1px 0 rgba(0,0,0,0.05); +} + +.leaderboard-table td { + padding: 10px; + border-bottom: 1px solid #e0e0e0; +} + +.leaderboard-table tbody tr:nth-child(even) { + background-color: #f9f9f9; +} + +/* Accuracy bar */ +.accuracy-bar { + position: relative; + width: 100%; + height: 20px; + background: #e0e0e0; + border-radius: 10px; + overflow: hidden; +} + +.accuracy-bar .fill { + height: 100%; + background: linear-gradient(90deg,#4caf50,#81c784); + border-radius: 10px 0 0 10px; +} + +.accuracy-bar span { + position: absolute; + width: 100%; + text-align: center; + top: 0; + left: 0; + font-size: 0.8rem; + font-weight: 600; + line-height: 20px; + color: #000; +} + diff --git a/docs/_templates/index.html b/docs/_templates/index.html index 65bcb22..a93b256 100644 --- a/docs/_templates/index.html +++ b/docs/_templates/index.html @@ -153,15 +153,9 @@ -

    📊 Leaderboard [in progress]

    -
    -

    - The official Leaderboard is currently empty and in progress. - - Submit - - your model results to be the first on the ranking! -

    +

    📊 Leaderboard — Execution Accuracy (EX)

    +
    +

    Loading leaderboard...

    @@ -169,7 +163,7 @@

    📄 Citation

    @inproceedings{llmsql_bench,
       title={LLMSQL: Upgrading WikiSQL for the LLM Era of Text-to-SQL},
       author={Pihulski, Dzmitry and Charchut, Karol and Novogrodskaia, Viktoria and Koco{'n}, Jan},
    -  booktitle={2025 IEEE ICDMW},
+  booktitle={2025 IEEE ICDMW},
       year={2025},
       organization={IEEE}
     }
    diff --git a/docs/docs/index.rst b/docs/docs/index.rst
    index ff5bd05..b2760cd 100644
    --- a/docs/docs/index.rst
    +++ b/docs/docs/index.rst
    @@ -1,7 +1,12 @@
     LLMSQL package Documentation
     ============================
     
    -`← Back to main page <../index.html>`__
    +.. raw:: html
    +
    +   
    +     ← Back to main page
    +   
    +
     
     Welcome to the LLMSQL documentation!
     This guide covers everything you need to use the project, from running inference
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..1049e44
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..7aaea2e
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.2-1B-Instruct
    +  revision: main
    +  commit_hash: 9213176726f574b556790deb65791e0c5aa438b6
    +  parameter_count: 1B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.2678
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-1B-Instruct/5fewshots/Llama-3.2-1B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..a3400c8
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..57616a3
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.2-3B-Instruct
    +  revision: main
    +  commit_hash: 0cb88a4f764b7a12671c53f0838cd831a0843b95
    +  parameter_count: 3B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.5415
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-3B-Instruct/5fewshots/Llama-3.2-3B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..4efc97e
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..1e966f9
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.3-70B-Instruct
    +  revision: main
    +  commit_hash: 6f6073b423013f6a7d4d9f39144961bfbfbc386b
    +  parameter_count: 70B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8607
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.3-70B-Instruct/5fewshots/Llama-3.3-70B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..ef0562c
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "mistralai/Mistral-Nemo-Instruct-2407"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml
    new file mode 100644
    index 0000000..7914a99
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: mistralai/Mistral-Nemo-Instruct-2407
    +  revision: main
    +  commit_hash: 04d8a90549d23fc6bd7f642064003592df51e9b3
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.7599
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Mistral-Nemo-Instruct-2407/5fewshots/Mistral-Nemo-Instruct-2407_outputs.jsonl
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..fa7cfb7
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml
    new file mode 100644
    index 0000000..a2d5154
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml
    @@ -0,0 +1,59 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
    +  revision: main
    +  commit_hash: 5a48de7e98cce824b3456eb9857ded839c3b6475
    +  parameter_count: 30B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8519
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16_outputs.jsonl
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..59bfa27
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-chat"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml
    new file mode 100644
    index 0000000..6a2ab5f
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/PLLuM-12B-chat
    +  revision: main
    +  commit_hash: 74d80ff96552d9555f6f6f28321433da3895d2ec
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.5224
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-chat/5fewshots/PLLuM-12B-chat_outputs.jsonl
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..372e696
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-nc-chat"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml
    new file mode 100644
    index 0000000..819021e
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/PLLuM-12B-nc-chat
    +  revision: main
    +  commit_hash: 7089352cfc2efbd2d3c64cc8cd5c97cd2c4fc013
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.4044
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-nc-chat/5fewshots/PLLuM-12B-nc-chat_outputs.jsonl
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..c8af571
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..47c8f25
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen2.5-1.5B-Instruct
    +  revision: main
    +  commit_hash: 989aa7980e4cf806f80c7fef2b1adb7bc71aa306
    +  parameter_count: 1.5B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.6401
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-1.5B-Instruct/5fewshots/Qwen2.5-1.5B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..e463467
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..0492a50
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen2.5-7B-Instruct
    +  revision: main
    +  commit_hash: a09a35458c702b33eeacc393d103063234e8bc28
    +  parameter_count: 7B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.7940
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-7B-Instruct/5fewshots/Qwen2.5-7B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..30b281f
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen3-0.6B"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=0.6,
    +    sampling_kwargs={"top_p": 0.95, "top_k": 20, "min_p": 0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/run.yaml b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml
    new file mode 100644
    index 0000000..ba714f8
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml
    @@ -0,0 +1,60 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen3-0.6B
    +  revision: main
    +  commit_hash: c1899de289a04d12100db370d81485cdf75e47ca
    +  parameter_count: 0.6B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
+    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 0.6
    +    sampling_kwargs:
    +      top_p: 0.95
    +      top_k: 20
    +      min_p: 0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.4983
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen3-0.6B/5fewshots/Qwen3-0.6B_outputs.jsonl
    diff --git a/leaderboard/docs/_build/html/_static/leaderboard.json b/leaderboard/docs/_build/html/_static/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/docs/_build/html/_static/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/docs/_static/leaderboard.json b/leaderboard/docs/_static/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/docs/_static/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/generate_leaderboard.py b/leaderboard/generate_leaderboard.py
    new file mode 100644
    index 0000000..b795556
    --- /dev/null
    +++ b/leaderboard/generate_leaderboard.py
    @@ -0,0 +1,34 @@
    +import yaml
    +import json
    +from pathlib import Path
    +import shutil
    +
    +BASE_DIR = Path(__file__).parent
    +DOCS_DIR = Path(__file__).parent.parent / "docs/_static"
    +BUILD_DIR = DOCS_DIR / "_build/html"
    +
    +rows = []
    +
    +for path in BASE_DIR.rglob("run.yaml"):
    +    with open(path) as f:
    +        data = yaml.safe_load(f)
    +
    +    rows.append({
    +        "model": data["model"]["name"],
    +        "type": data.get("type", ""),
    +        "fewshots": data["inference"]["arguments"]["num_fewshots"],
    +        "backend": data["inference"]["backend"],
    +        "accuracy": data["results"]["execution_accuracy"],
    +        "date": str(data["date"]),
    +    })
    +
    +rows.sort(key=lambda x: x["accuracy"], reverse=True)
    +
    +json_file = DOCS_DIR / "leaderboard.json"
    +with open(json_file, "w") as f:
    +    json.dump(rows, f, indent=2)
    +
    +if BUILD_DIR.exists():
    +    shutil.copy(json_file, BUILD_DIR / "leaderboard.json")
    +
    +print(f"✅ leaderboard.json in {json_file}")
    \ No newline at end of file
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/inference_script.py b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..e99d509
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "openai/gpt-oss-120b"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/requirements.txt b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/run.yaml b/leaderboard/gpt-oss-120b/5fewshots/run.yaml
    new file mode 100644
    index 0000000..fe878e3
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/run.yaml
    @@ -0,0 +1,60 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: openai/gpt-oss-120b
    +  revision: main
    +  commit_hash: b5c939de8f754692c1647ca79fbf85e8c1e70f8a
    +  parameter_count: 120B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 0.95
    +      repetition_penalty: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.9049
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-120b/5fewshots/gpt-oss-120b_outputs.jsonl
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/inference_script.py b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..3125855
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "openai/gpt-oss-20b"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/requirements.txt b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/run.yaml b/leaderboard/gpt-oss-20b/5fewshots/run.yaml
    new file mode 100644
    index 0000000..74b6638
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/run.yaml
    @@ -0,0 +1,59 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: openai/gpt-oss-20b
    +  revision: main
    +  commit_hash: 6cee5e81ee83917806bbde320786a8fb61efebee
    +  parameter_count: 20B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 0.95
    +      repetition_penalty: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8871
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-20b/5fewshots/gpt-oss-20b_outputs.jsonl
    diff --git a/leaderboard/leaderboard.json b/leaderboard/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..c2e8457
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/pllum-12b-nc-chat-250715"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml
    new file mode 100644
    index 0000000..6b48d39
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/pllum-12b-nc-chat-250715
    +  revision: main
    +  commit_hash: 025e26b3fc5ac1fa8714298e671a6cf2418123d7
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.3727
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/pllum-12b-nc-chat-250715/5fewshots/pllum-12b-nc-chat-250715_outputs.jsonl