diff --git a/.gitignore b/.gitignore index a3c3066..b74d3e5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,11 @@ llmsql_workdir evaluation_* coverage.xml + +.idea + +# Sphinx build +docs/_build/ +docs/.doctrees/ +*.doctree +*.pickle diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo index b853fd5..d7142fb 100644 --- a/docs/_build/html/.buildinfo +++ b/docs/_build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file records the configuration used when building these files. When it is not found, a full rebuild will be done. -config: ba6688d44e6ba22fb6e40076d1af75c2 +config: 3caef0746bc07fabd8f91030ce7b6533 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_build/html/.doctrees/docs/index.doctree b/docs/_build/html/.doctrees/docs/index.doctree deleted file mode 100644 index eb4bb1d..0000000 Binary files a/docs/_build/html/.doctrees/docs/index.doctree and /dev/null differ diff --git a/docs/_build/html/.doctrees/environment.pickle b/docs/_build/html/.doctrees/environment.pickle deleted file mode 100644 index 263655d..0000000 Binary files a/docs/_build/html/.doctrees/environment.pickle and /dev/null differ diff --git a/docs/_build/html/_static/documentation_options.js b/docs/_build/html/_static/documentation_options.js index 82d487f..eede5b1 100644 --- a/docs/_build/html/_static/documentation_options.js +++ b/docs/_build/html/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.1.14', + VERSION: '0.1.15', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', @@ -10,4 +10,4 @@ const DOCUMENTATION_OPTIONS = { NAVIGATION_WITH_KEYS: false, SHOW_SEARCH_SUMMARY: true, ENABLE_SEARCH_SHORTCUTS: true, -}; \ No newline at end of file +}; diff --git a/docs/_build/html/_static/leaderboard.json b/docs/_build/html/_static/leaderboard.json new file mode 100644 index 0000000..c842948 --- /dev/null +++ b/docs/_build/html/_static/leaderboard.json @@ -0,0 +1,106 @@ +[ + { + "model": 
"openai/gpt-oss-120b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.9049, + "date": "2026-02-24" + }, + { + "model": "openai/gpt-oss-20b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8871, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8607, + "date": "2026-02-24" + }, + { + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8519, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-7B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.794, + "date": "2026-02-24" + }, + { + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.7599, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.6401, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5415, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5224, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen3-0.6B", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4983, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4044, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.3727, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "type": "open-source", + "fewshots": 5, + 
"backend": "vllm", + "accuracy": 0.2678, + "date": "2026-02-24" + } +] \ No newline at end of file diff --git a/docs/_build/html/_static/scripts/front_page.js b/docs/_build/html/_static/scripts/front_page.js index 03fd423..1e0c0dc 100644 --- a/docs/_build/html/_static/scripts/front_page.js +++ b/docs/_build/html/_static/scripts/front_page.js @@ -48,3 +48,88 @@ if (searchInput) { } }); } + +document.addEventListener("DOMContentLoaded", async () => { + const container = document.getElementById('leaderboard-container'); + if (!container) return; + + try { + const response = await fetch('_static/leaderboard.json'); + const rows = await response.json(); + renderLeaderboard(rows); + } catch (e) { + container.innerHTML = '

Error loading leaderboard 😢

'; + console.error(e); + } +}); + +function renderLeaderboard(rows) { + const container = document.getElementById('leaderboard-container'); + container.innerHTML = ''; + + const table = document.createElement('table'); + table.className = 'leaderboard-table'; + + const thead = document.createElement('thead'); + thead.innerHTML = ` + + Rank + Model + Type + Fewshots + Backend + Accuracy + Date + `; + table.appendChild(thead); + + const tbody = document.createElement('tbody'); + rows.forEach((row, i) => { + const tr = document.createElement('tr'); + + // Берём только вторую часть после слеша + const modelName = row.model.includes('/') ? row.model.split('/')[1] : row.model; + + // Модель с ссылкой + const modelCell = document.createElement('td'); + if (row.url) { + const a = document.createElement('a'); + a.href = row.url; + a.target = "_blank"; + a.rel = "noopener"; + a.textContent = modelName; // <-- здесь только вторая часть + modelCell.appendChild(a); + } else { + modelCell.textContent = modelName; + } + + // Accuracy + const accuracyCell = document.createElement('td'); + const barContainer = document.createElement('div'); + barContainer.className = 'accuracy-bar'; + const fill = document.createElement('div'); + fill.className = 'fill'; + fill.style.width = `${(row.accuracy*100).toFixed(2)}%`; + const text = document.createElement('span'); + text.textContent = `${(row.accuracy*100).toFixed(2)}%`; + barContainer.appendChild(fill); + barContainer.appendChild(text); + accuracyCell.appendChild(barContainer); + + // Вставка остальных ячеек + tr.innerHTML += `${i+1}`; + tr.appendChild(modelCell); + tr.innerHTML += ` + ${row.type} + ${row.fewshots} + ${row.backend} + `; + tr.appendChild(accuracyCell); + tr.innerHTML += `${row.date}`; + + tbody.appendChild(tr); + }); + + table.appendChild(tbody); + container.appendChild(table); +} \ No newline at end of file diff --git a/docs/_build/html/_static/styles/front_page.css b/docs/_build/html/_static/styles/front_page.css index 
55bce1c..1d3bcfb 100644 --- a/docs/_build/html/_static/styles/front_page.css +++ b/docs/_build/html/_static/styles/front_page.css @@ -248,3 +248,70 @@ pre span { background: none !important; color: inherit !important; } + +.leaderboard-box { + padding: 1rem; + background: #fff; + border-radius: 12px; + box-shadow: 0 6px 20px rgba(0,0,0,0.08); + overflow-x: auto; +} + +.leaderboard-table { + width: 100%; + border-collapse: collapse; + font-family: 'Inter', 'Roboto', sans-serif; + font-size: 0.95rem; + text-align: center; +} + +.leaderboard-table th { + background: linear-gradient(180deg, #f6f6f6 0%, #e9e9e9 100%); + color: #111827; + font-weight: 600; + font-size: 0.95rem; + padding: 14px 10px; + text-transform: uppercase; + letter-spacing: 0.5px; + border-bottom: 2px solid #ddd; + text-align: center; + box-shadow: inset 0 -1px 0 rgba(0,0,0,0.05); +} + +.leaderboard-table td { + padding: 10px; + border-bottom: 1px solid #e0e0e0; +} + +.leaderboard-table tbody tr:nth-child(even) { + background-color: #f9f9f9; +} + +/* Accuracy bar */ +.accuracy-bar { + position: relative; + width: 100%; + height: 20px; + background: #e0e0e0; + border-radius: 10px; + overflow: hidden; +} + +.accuracy-bar .fill { + height: 100%; + background: linear-gradient(90deg,#4caf50,#81c784); + border-radius: 10px 0 0 10px; +} + +.accuracy-bar span { + position: absolute; + width: 100%; + text-align: center; + top: 0; + left: 0; + font-size: 0.8rem; + font-weight: 600; + line-height: 20px; + color: #000; +} + diff --git a/docs/_build/html/docs/evaluation.html b/docs/_build/html/docs/evaluation.html index 8f68b02..22d13e4 100644 --- a/docs/_build/html/docs/evaluation.html +++ b/docs/_build/html/docs/evaluation.html @@ -5,21 +5,23 @@ - Evaluation API Reference — LLMSQL 0.1.14 documentation + Evaluation API Reference — LLMSQL 0.1.15 documentation + - - + + - + - + + + +
- + +

Evaluation API Reference

The evaluate() function allows you to benchmark Text-to-SQL model outputs @@ -173,6 +178,7 @@

Table of Contents

  • Input Format
  • Output Metrics
  • Report Saving
  • +
  • Report Saving
  • @@ -213,12 +219,13 @@

    Navigation

  • previous |
  • - + - + +
    - \ No newline at end of file + diff --git a/docs/_build/html/docs/index.html b/docs/_build/html/docs/index.html index 9c50489..9c26bf9 100644 --- a/docs/_build/html/docs/index.html +++ b/docs/_build/html/docs/index.html @@ -5,22 +5,24 @@ - LLMSQL package Documentation — LLMSQL 0.1.14 documentation + LLMSQL package Documentation — LLMSQL 0.1.15 documentation + - - + + - + - + + +
    +
    - \ No newline at end of file + diff --git a/docs/_build/html/docs/inference.html b/docs/_build/html/docs/inference.html index 61c12bb..2aa340f 100644 --- a/docs/_build/html/docs/inference.html +++ b/docs/_build/html/docs/inference.html @@ -5,22 +5,24 @@ - Inference API Reference — LLMSQL 0.1.14 documentation + Inference API Reference — LLMSQL 0.1.15 documentation + - - + + - + - + + +
    +
    - + +
    +

    Inference API Reference

    +

    Inference API Reference

    @@ -103,12 +110,13 @@

    Navigation

  • previous |
  • - + - + +
    - \ No newline at end of file + diff --git a/docs/_build/html/docs/usage.html b/docs/_build/html/docs/usage.html index 643646c..2e73e7a 100644 --- a/docs/_build/html/docs/usage.html +++ b/docs/_build/html/docs/usage.html @@ -5,22 +5,24 @@ - Usage Overview — LLMSQL 0.1.14 documentation + Usage Overview — LLMSQL 0.1.15 documentation + - - + + - + - + + +
    +
    - \ No newline at end of file + diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html index 73f757d..2caaec9 100644 --- a/docs/_build/html/genindex.html +++ b/docs/_build/html/genindex.html @@ -4,20 +4,22 @@ - Index — LLMSQL 0.1.14 documentation + Index — LLMSQL 0.1.15 documentation + - - + + - + - + + +
    +
    - + +

    Index

    - + +
    @@ -69,11 +74,11 @@

    Navigation

  • index
  • - - + +
    - \ No newline at end of file + diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html index e77de67..bd4199d 100644 --- a/docs/_build/html/index.html +++ b/docs/_build/html/index.html @@ -153,15 +153,9 @@ -

    📊 Leaderboard [in progress]

    -
    -

    - The official Leaderboard is currently empty and in progress. - - Submit - - your model results to be the first on the ranking! -

    +

    📊 Leaderboard — Execution Accuracy (EX)

    +
    +

    Loading leaderboard...

    @@ -169,7 +163,7 @@

    📄 Citation

    @inproceedings{llmsql_bench,
       title={LLMSQL: Upgrading WikiSQL for the LLM Era of Text-to-SQL},
       author={Pihulski, Dzmitry and Charchut, Karol and Novogrodskaia, Viktoria and Koco{'n}, Jan},
    -  booktitle={2025 IEEE ICDMW},
    +  booktitle={2025 IEEE ICDMW},
       year={2025},
       organization={IEEE}
     }
    diff --git a/docs/_build/html/leaderboard.json b/docs/_build/html/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/docs/_build/html/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html
    index b536f93..1a05821 100644
    --- a/docs/_build/html/search.html
    +++ b/docs/_build/html/search.html
    @@ -4,18 +4,19 @@
       
         
         
    -    Search — LLMSQL 0.1.14 documentation
    +    Search — LLMSQL 0.1.15 documentation
         
         
    +    
         
    -    
    -    
    -    
    +    
    +
    +    
         
         
         
         
    -    
    +    
         
         
         
    @@ -23,7 +24,8 @@
         
         
         
    -     
    +
    +
     
       
           
    +    
    +
    - + +

    Search

    - + + - - + + + +

    Searching for multiple words only shows matches that contain all words.

    - - + + + +
    - - + + + +
    - + +
    @@ -86,11 +98,11 @@

    Navigation

  • index
  • - - + +
    - \ No newline at end of file + diff --git a/docs/_static/leaderboard.json b/docs/_static/leaderboard.json new file mode 100644 index 0000000..c842948 --- /dev/null +++ b/docs/_static/leaderboard.json @@ -0,0 +1,106 @@ +[ + { + "model": "openai/gpt-oss-120b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.9049, + "date": "2026-02-24" + }, + { + "model": "openai/gpt-oss-20b", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8871, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.3-70B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8607, + "date": "2026-02-24" + }, + { + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.8519, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-7B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.794, + "date": "2026-02-24" + }, + { + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.7599, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.6401, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5415, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.5224, + "date": "2026-02-24" + }, + { + "model": "Qwen/Qwen3-0.6B", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4983, + "date": "2026-02-24" + }, + { + "model": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.4044, + "date": "2026-02-24" + }, + { + "model": 
"CYFRAGOVPL/pllum-12b-nc-chat-250715", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.3727, + "date": "2026-02-24" + }, + { + "model": "meta-llama/Llama-3.2-1B-Instruct", + "type": "open-source", + "fewshots": 5, + "backend": "vllm", + "accuracy": 0.2678, + "date": "2026-02-24" + } +] \ No newline at end of file diff --git a/docs/_static/scripts/front_page.js b/docs/_static/scripts/front_page.js index 03fd423..4f2cdc5 100644 --- a/docs/_static/scripts/front_page.js +++ b/docs/_static/scripts/front_page.js @@ -48,3 +48,86 @@ if (searchInput) { } }); } + +document.addEventListener("DOMContentLoaded", async () => { + const container = document.getElementById('leaderboard-container'); + if (!container) return; + + try { + const response = await fetch('_static/leaderboard.json'); + const rows = await response.json(); + renderLeaderboard(rows); + } catch (e) { + container.innerHTML = '

    Error loading leaderboard 😢

    '; + console.error(e); + } +}); + +function renderLeaderboard(rows) { + const container = document.getElementById('leaderboard-container'); + container.innerHTML = ''; + + const table = document.createElement('table'); + table.className = 'leaderboard-table'; + + const thead = document.createElement('thead'); + thead.innerHTML = ` + + Rank + Model + Type + Fewshots + Backend + Accuracy + Date + `; + table.appendChild(thead); + + const tbody = document.createElement('tbody'); + rows.forEach((row, i) => { + const tr = document.createElement('tr'); + + const modelName = row.model.includes('/') ? row.model.split('/')[1] : row.model; + + const modelCell = document.createElement('td'); + if (row.url) { + const a = document.createElement('a'); + a.href = row.url; + a.target = "_blank"; + a.rel = "noopener"; + a.textContent = modelName; // <-- здесь только вторая часть + modelCell.appendChild(a); + } else { + modelCell.textContent = modelName; + } + + // Accuracy + const accuracyCell = document.createElement('td'); + const barContainer = document.createElement('div'); + barContainer.className = 'accuracy-bar'; + const fill = document.createElement('div'); + fill.className = 'fill'; + fill.style.width = `${(row.accuracy*100).toFixed(2)}%`; + const text = document.createElement('span'); + text.textContent = `${(row.accuracy*100).toFixed(2)}%`; + barContainer.appendChild(fill); + barContainer.appendChild(text); + accuracyCell.appendChild(barContainer); + + + tr.innerHTML += `${i+1}`; + tr.appendChild(modelCell); + tr.innerHTML += ` + ${row.type} + ${row.fewshots} + ${row.backend} + `; + tr.appendChild(accuracyCell); + tr.innerHTML += `${row.date}`; + + tbody.appendChild(tr); + }); + + table.appendChild(tbody); + container.appendChild(table); +} \ No newline at end of file diff --git a/docs/_static/styles/front_page.css b/docs/_static/styles/front_page.css index 55bce1c..1d3bcfb 100644 --- a/docs/_static/styles/front_page.css +++ b/docs/_static/styles/front_page.css @@ 
-248,3 +248,70 @@ pre span { background: none !important; color: inherit !important; } + +.leaderboard-box { + padding: 1rem; + background: #fff; + border-radius: 12px; + box-shadow: 0 6px 20px rgba(0,0,0,0.08); + overflow-x: auto; +} + +.leaderboard-table { + width: 100%; + border-collapse: collapse; + font-family: 'Inter', 'Roboto', sans-serif; + font-size: 0.95rem; + text-align: center; +} + +.leaderboard-table th { + background: linear-gradient(180deg, #f6f6f6 0%, #e9e9e9 100%); + color: #111827; + font-weight: 600; + font-size: 0.95rem; + padding: 14px 10px; + text-transform: uppercase; + letter-spacing: 0.5px; + border-bottom: 2px solid #ddd; + text-align: center; + box-shadow: inset 0 -1px 0 rgba(0,0,0,0.05); +} + +.leaderboard-table td { + padding: 10px; + border-bottom: 1px solid #e0e0e0; +} + +.leaderboard-table tbody tr:nth-child(even) { + background-color: #f9f9f9; +} + +/* Accuracy bar */ +.accuracy-bar { + position: relative; + width: 100%; + height: 20px; + background: #e0e0e0; + border-radius: 10px; + overflow: hidden; +} + +.accuracy-bar .fill { + height: 100%; + background: linear-gradient(90deg,#4caf50,#81c784); + border-radius: 10px 0 0 10px; +} + +.accuracy-bar span { + position: absolute; + width: 100%; + text-align: center; + top: 0; + left: 0; + font-size: 0.8rem; + font-weight: 600; + line-height: 20px; + color: #000; +} + diff --git a/docs/_templates/index.html b/docs/_templates/index.html index 65bcb22..a93b256 100644 --- a/docs/_templates/index.html +++ b/docs/_templates/index.html @@ -153,15 +153,9 @@ -

    📊 Leaderboard [in progress]

    -
    -

    - The official Leaderboard is currently empty and in progress. - - Submit - - your model results to be the first on the ranking! -

    +

    📊 Leaderboard — Execution Accuracy (EX)

    +
    +

    Loading leaderboard...

    @@ -169,7 +163,7 @@

    📄 Citation

    @inproceedings{llmsql_bench,
       title={LLMSQL: Upgrading WikiSQL for the LLM Era of Text-to-SQL},
       author={Pihulski, Dzmitry and Charchut, Karol and Novogrodskaia, Viktoria and Koco{'n}, Jan},
    -  booktitle={2025 IEEE ICDMW},
    +  booktitle={2025 IEEE ICDMW},
       year={2025},
       organization={IEEE}
     }
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..1049e44
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..7aaea2e
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-1B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.2-1B-Instruct
    +  revision: main
    +  commit_hash: 9213176726f574b556790deb65791e0c5aa438b6
    +  parameter_count: 1B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.2678
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-1B-Instruct/5fewshots/Llama-3.2-1B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..a3400c8
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..57616a3
    --- /dev/null
    +++ b/leaderboard/Llama-3.2-3B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.2-3B-Instruct
    +  revision: main
    +  commit_hash: 0cb88a4f764b7a12671c53f0838cd831a0843b95
    +  parameter_count: 3B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.5415
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.2-3B-Instruct/5fewshots/Llama-3.2-3B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..4efc97e
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..1e966f9
    --- /dev/null
    +++ b/leaderboard/Llama-3.3-70B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: meta-llama/Llama-3.3-70B-Instruct
    +  revision: main
    +  commit_hash: 6f6073b423013f6a7d4d9f39144961bfbfbc386b
    +  parameter_count: 70B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8607
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Llama-3.3-70B-Instruct/5fewshots/Llama-3.3-70B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..ef0562c
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "mistralai/Mistral-Nemo-Instruct-2407"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml
    new file mode 100644
    index 0000000..7914a99
    --- /dev/null
    +++ b/leaderboard/Mistral-Nemo-Instruct-2407/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: mistralai/Mistral-Nemo-Instruct-2407
    +  revision: main
    +  commit_hash: 04d8a90549d23fc6bd7f642064003592df51e9b3
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.7599
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Mistral-Nemo-Instruct-2407/5fewshots/Mistral-Nemo-Instruct-2407_outputs.jsonl
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..fa7cfb7
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml
    new file mode 100644
    index 0000000..a2d5154
    --- /dev/null
    +++ b/leaderboard/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/run.yaml
    @@ -0,0 +1,59 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
    +  revision: main
    +  commit_hash: 5a48de7e98cce824b3456eb9857ded839c3b6475
    +  parameter_count: 30B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8519
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/5fewshots/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16_outputs.jsonl
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..59bfa27
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-chat"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml
    new file mode 100644
    index 0000000..6a2ab5f
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-chat/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/PLLuM-12B-chat
    +  revision: main
    +  commit_hash: 74d80ff96552d9555f6f6f28321433da3895d2ec
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.5224
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-chat/5fewshots/PLLuM-12B-chat_outputs.jsonl
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..372e696
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/PLLuM-12B-nc-chat"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml
    new file mode 100644
    index 0000000..819021e
    --- /dev/null
    +++ b/leaderboard/PLLuM-12B-nc-chat/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/PLLuM-12B-nc-chat
    +  revision: main
    +  commit_hash: 7089352cfc2efbd2d3c64cc8cd5c97cd2c4fc013
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.4044
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/PLLuM-12B-nc-chat/5fewshots/PLLuM-12B-nc-chat_outputs.jsonl
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..c8af571
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..47c8f25
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-1.5B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,57 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen2.5-1.5B-Instruct
    +  revision: main
    +  commit_hash: 989aa7980e4cf806f80c7fef2b1adb7bc71aa306
    +  parameter_count: 1.5B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.6401
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-1.5B-Instruct/5fewshots/Qwen2.5-1.5B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..e463467
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml
    new file mode 100644
    index 0000000..0492a50
    --- /dev/null
    +++ b/leaderboard/Qwen2.5-7B-Instruct/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen2.5-7B-Instruct
    +  revision: main
    +  commit_hash: a09a35458c702b33eeacc393d103063234e8bc28
    +  parameter_count: 7B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.7940
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen2.5-7B-Instruct/5fewshots/Qwen2.5-7B-Instruct_outputs.jsonl
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..30b281f
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "Qwen/Qwen3-0.6B"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=0.6,
    +    sampling_kwargs={"top_p": 0.95, "top_k": 20, "min_p": 0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/Qwen3-0.6B/5fewshots/run.yaml b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml
    new file mode 100644
    index 0000000..ba714f8
    --- /dev/null
    +++ b/leaderboard/Qwen3-0.6B/5fewshots/run.yaml
    @@ -0,0 +1,60 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: Qwen/Qwen3-0.6B
    +  revision: main
    +  commit_hash: c1899de289a04d12100db370d81485cdf75e47ca
    +  parameter_count: 0.6B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 0.6
    +    sampling_kwargs:
    +      top_p: 0.95
    +      top_k: 20
    +      min_p: 0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.4983
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/Qwen3-0.6B/5fewshots/Qwen3-0.6B_outputs.jsonl
    diff --git a/leaderboard/docs/_build/html/_static/leaderboard.json b/leaderboard/docs/_build/html/_static/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/docs/_build/html/_static/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/docs/_static/leaderboard.json b/leaderboard/docs/_static/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/docs/_static/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/generate_leaderboard.py b/leaderboard/generate_leaderboard.py
    new file mode 100644
    index 0000000..b795556
    --- /dev/null
    +++ b/leaderboard/generate_leaderboard.py
    @@ -0,0 +1,34 @@
    +import yaml
    +import json
    +from pathlib import Path
    +import shutil
    +
    +BASE_DIR = Path(__file__).parent
    +DOCS_DIR = Path(__file__).parent.parent / "docs/_static"
    +BUILD_DIR = DOCS_DIR.parent / "_build/html/_static"
    +
    +rows = []
    +
    +for path in BASE_DIR.rglob("run.yaml"):
    +    with open(path) as f:
    +        data = yaml.safe_load(f)
    +
    +    rows.append({
    +        "model": data["model"]["name"],
    +        "type": data.get("type", ""),
    +        "fewshots": data["inference"]["arguments"]["num_fewshots"],
    +        "backend": data["inference"]["backend"],
    +        "accuracy": data["results"]["execution_accuracy"],
    +        "date": str(data["date"]),
    +    })
    +
    +rows.sort(key=lambda x: x["accuracy"], reverse=True)
    +
    +json_file = DOCS_DIR / "leaderboard.json"
    +with open(json_file, "w") as f:
    +    json.dump(rows, f, indent=2)
    +
    +if BUILD_DIR.exists():
    +    shutil.copy(json_file, BUILD_DIR / "leaderboard.json")
    +
    +print(f"✅ leaderboard.json in {json_file}")
    \ No newline at end of file
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/inference_script.py b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..e99d509
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "openai/gpt-oss-120b"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/requirements.txt b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/gpt-oss-120b/5fewshots/run.yaml b/leaderboard/gpt-oss-120b/5fewshots/run.yaml
    new file mode 100644
    index 0000000..fe878e3
    --- /dev/null
    +++ b/leaderboard/gpt-oss-120b/5fewshots/run.yaml
    @@ -0,0 +1,60 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: openai/gpt-oss-120b
    +  revision: main
    +  commit_hash: b5c939de8f754692c1647ca79fbf85e8c1e70f8a
    +  parameter_count: 120B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 0.95
    +      repetition_penalty: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.9049
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-120b/5fewshots/gpt-oss-120b_outputs.jsonl
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/inference_script.py b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..3125855
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/inference_script.py
    @@ -0,0 +1,26 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "openai/gpt-oss-20b"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=True,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=1024,
    +    temperature=1.0,
    +    sampling_kwargs={"top_p": 0.95, "repetition_penalty": 1.0},
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/requirements.txt b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/gpt-oss-20b/5fewshots/run.yaml b/leaderboard/gpt-oss-20b/5fewshots/run.yaml
    new file mode 100644
    index 0000000..74b6638
    --- /dev/null
    +++ b/leaderboard/gpt-oss-20b/5fewshots/run.yaml
    @@ -0,0 +1,59 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: openai/gpt-oss-20b
    +  revision: main
    +  commit_hash: 6cee5e81ee83917806bbde320786a8fb61efebee
    +  parameter_count: 20B
    +  dtype: bfloat16
    +  thinking: true
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: true
    +    max_new_tokens: 1024
    +    temperature: 1.0
    +    sampling_kwargs:
    +      top_p: 0.95
    +      repetition_penalty: 1.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.8871
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/gpt-oss-20b/5fewshots/gpt-oss-20b_outputs.jsonl
    diff --git a/leaderboard/leaderboard.json b/leaderboard/leaderboard.json
    new file mode 100644
    index 0000000..c842948
    --- /dev/null
    +++ b/leaderboard/leaderboard.json
    @@ -0,0 +1,106 @@
    +[
    +  {
    +    "model": "openai/gpt-oss-120b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.9049,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "openai/gpt-oss-20b",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8871,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.3-70B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8607,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.8519,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-7B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.794,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "mistralai/Mistral-Nemo-Instruct-2407",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.7599,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.6401,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-3B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5415,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.5224,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "Qwen/Qwen3-0.6B",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4983,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.4044,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "CYFRAGOVPL/pllum-12b-nc-chat-250715",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.3727,
    +    "date": "2026-02-24"
    +  },
    +  {
    +    "model": "meta-llama/Llama-3.2-1B-Instruct",
    +    "type": "open-source",
    +    "fewshots": 5,
    +    "backend": "vllm",
    +    "accuracy": 0.2678,
    +    "date": "2026-02-24"
    +  }
    +]
    \ No newline at end of file
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py
    new file mode 100644
    index 0000000..c2e8457
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/inference_script.py
    @@ -0,0 +1,25 @@
    +import os
    +
    +from dotenv import load_dotenv
    +
    +from llmsql import evaluate, inference_vllm
    +
    +load_dotenv()
    +
    +MODEL_NAME = "CYFRAGOVPL/pllum-12b-nc-chat-250715"
    +
    +results = inference_vllm(
    +    model_name=MODEL_NAME,
    +    output_file=f"{MODEL_NAME}_outputs.jsonl",
    +    batch_size=20000,
    +    tensor_parallel_size=4,
    +    do_sample=False,
    +    hf_token=os.environ["HF_TOKEN"],
    +    max_new_tokens=256,
    +    temperature=0.0,
    +    num_fewshots=5,
    +    seed=42,
    +    llm_kwargs={"dtype": "bfloat16"},
    +)
    +
    +evaluate(results)
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt
    new file mode 100644
    index 0000000..929f583
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/requirements.txt
    @@ -0,0 +1,172 @@
    +accelerate==1.12.0
    +aiohappyeyeballs==2.6.1
    +aiohttp==3.13.3
    +aiosignal==1.4.0
    +annotated-doc==0.0.4
    +annotated-types==0.7.0
    +anthropic==0.83.0
    +anyio==4.12.1
    +apache-tvm-ffi==0.1.8.post2
    +astor==0.8.1
    +attrs==25.4.0
    +blake3==1.0.8
    +cachetools==7.0.1
    +cbor2==5.8.0
    +certifi==2026.1.4
    +cffi==2.0.0
    +charset-normalizer==3.4.4
    +click==8.3.1
    +cloudpickle==3.1.2
    +compressed-tensors==0.13.0
    +cryptography==46.0.5
    +cuda-bindings==13.1.1
    +cuda-pathfinder==1.3.5
    +cuda-python==13.1.1
    +cupy-cuda12x==14.0.1
    +datasets==4.5.0
    +depyf==0.20.0
    +dill==0.4.0
    +diskcache==5.6.3
    +distro==1.9.0
    +dnspython==2.8.0
    +docstring_parser==0.17.0
    +einops==0.8.2
    +email-validator==2.3.0
    +fastapi==0.132.0
    +fastapi-cli==0.0.23
    +fastapi-cloud-cli==0.13.0
    +fastar==0.8.0
    +filelock==3.24.3
    +flashinfer-python==0.6.1
    +frozenlist==1.8.0
    +fsspec==2025.10.0
    +gguf==0.17.1
    +grpcio==1.78.1
    +grpcio-reflection==1.78.1
    +h11==0.16.0
    +hf-xet==1.3.0
    +httpcore==1.0.9
    +httptools==0.7.1
    +httpx==0.28.1
    +httpx-sse==0.4.3
    +huggingface_hub==0.36.2
    +idna==3.11
    +ijson==3.5.0
    +interegular==0.3.3
    +Jinja2==3.1.6
    +jiter==0.13.0
    +jmespath==1.1.0
    +jsonschema==4.26.0
    +jsonschema-specifications==2025.9.1
    +lark==1.2.2
    +llguidance==1.3.0
    +llmsql==0.1.15
    +llvmlite==0.44.0
    +lm-format-enforcer==0.11.3
    +loguru==0.7.3
    +markdown-it-py==4.0.0
    +MarkupSafe==3.0.3
    +mcp==1.26.0
    +mdurl==0.1.2
    +mistral_common==1.9.1
    +model-hosting-container-standards==0.1.13
    +mpmath==1.3.0
    +msgpack==1.1.2
    +msgspec==0.20.0
    +multidict==6.7.1
    +multiprocess==0.70.18
    +networkx==3.6.1
    +ninja==1.13.0
    +numba==0.61.2
    +numpy==2.2.6
    +nvidia-cublas-cu12==12.8.4.1
    +nvidia-cuda-cupti-cu12==12.8.90
    +nvidia-cuda-nvrtc-cu12==12.8.93
    +nvidia-cuda-runtime-cu12==12.8.90
    +nvidia-cudnn-cu12==9.10.2.21
    +nvidia-cudnn-frontend==1.18.0
    +nvidia-cufft-cu12==11.3.3.83
    +nvidia-cufile-cu12==1.13.1.3
    +nvidia-curand-cu12==10.3.9.90
    +nvidia-cusolver-cu12==11.7.3.90
    +nvidia-cusparse-cu12==12.5.8.93
    +nvidia-cusparselt-cu12==0.7.1
    +nvidia-cutlass-dsl==4.4.0
    +nvidia-cutlass-dsl-libs-base==4.4.0
    +nvidia-ml-py==13.590.48
    +nvidia-nccl-cu12==2.27.5
    +nvidia-nvjitlink-cu12==12.8.93
    +nvidia-nvshmem-cu12==3.3.20
    +nvidia-nvtx-cu12==12.8.90
    +openai==2.23.0
    +openai-harmony==0.0.8
    +opencv-python-headless==4.13.0.92
    +outlines_core==0.2.11
    +packaging==26.0
    +pandas==3.0.1
    +partial-json-parser==0.2.1.1.post7
    +pillow==12.1.1
    +prometheus-fastapi-instrumentator==7.1.0
    +prometheus_client==0.24.1
    +propcache==0.4.1
    +protobuf==6.33.5
    +psutil==7.2.2
    +py-cpuinfo==9.0.0
    +pyarrow==23.0.1
    +pybase64==1.4.3
    +pycountry==26.2.16
    +pycparser==3.0
    +pydantic==2.12.5
    +pydantic-extra-types==2.11.0
    +pydantic-settings==2.13.1
    +pydantic_core==2.41.5
    +Pygments==2.19.2
    +PyJWT==2.11.0
    +python-dateutil==2.9.0.post0
    +python-dotenv==1.2.1
    +python-json-logger==4.0.0
    +python-multipart==0.0.22
    +PyYAML==6.0.3
    +pyzmq==27.1.0
    +ray==2.54.0
    +referencing==0.37.0
    +regex==2026.2.19
    +requests==2.32.5
    +rich==14.3.3
    +rich-toolkit==0.19.4
    +rignore==0.7.6
    +rpds-py==0.30.0
    +safetensors==0.7.0
    +sentencepiece==0.2.1
    +sentry-sdk==2.53.0
    +setproctitle==1.3.7
    +setuptools==80.10.2
    +shellingham==1.5.4
    +six==1.17.0
    +sniffio==1.3.1
    +sse-starlette==3.2.0
    +starlette==0.52.1
    +supervisor==4.3.0
    +sympy==1.14.0
    +tabulate==0.9.0
    +tiktoken==0.12.0
    +tokenizers==0.22.2
    +torch==2.9.1
    +torchaudio==2.9.1
    +torchvision==0.24.1
    +tqdm==4.67.3
    +transformers==4.57.6
    +triton==3.5.1
    +typer==0.24.1
    +typer-slim==0.24.0
    +typing-inspection==0.4.2
    +typing_extensions==4.15.0
    +urllib3==2.6.3
    +uvicorn==0.41.0
    +uvloop==0.22.1
    +vllm==0.15.1
    +watchfiles==1.1.1
    +websockets==16.0
    +xgrammar==0.1.29
    +xxhash==3.6.0
    +yarl==1.22.0
    diff --git a/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml
    new file mode 100644
    index 0000000..6b48d39
    --- /dev/null
    +++ b/leaderboard/pllum-12b-nc-chat-250715/5fewshots/run.yaml
    @@ -0,0 +1,56 @@
    +date: 2026-02-24
    +# =====================
    +# Model Information
    +# =====================
    +model:
    +  name: CYFRAGOVPL/pllum-12b-nc-chat-250715
    +  revision: main
    +  commit_hash: 025e26b3fc5ac1fa8714298e671a6cf2418123d7
    +  parameter_count: 12B
    +  dtype: bfloat16
    +  thinking: false
    +
    +type: open-source  # open-source | proprietary
    +
    +# =====================
    +# Package Information
    +# =====================
    +llmsql:
    +  version: 0.1.15
    +  commit_hash: 79175212c90b1fc094abd2c9666c23d903060014
    +
    +# =====================
    +# Benchmark Information
    +# =====================
    +version: 2.0
    +
    +# =====================
    +# Environment Information
    +# =====================
    +os_name: Ubuntu 24.04.3 LTS
    +python_version: 3.12.12
    +pip_freeze: requirements.txt
    +device: 4xH200
    +
    +# =====================
    +# Function Inputs / Inference Backend
    +# =====================
    +inference:
    +  backend: vllm  # vllm | transformers
    +  arguments:
    +    batch_size: 20000
    +    tensor_parallel_size: 4
    +    do_sample: false
    +    max_new_tokens: 256
    +    temperature: 0.0
    +    num_fewshots: 5
    +    seed: 42
    +    llm_kwargs:
    +      dtype: bfloat16
    +
    +# =====================
    +# Results
    +# =====================
    +results:
    +  execution_accuracy: 0.3727
    +  answers_path: https://huggingface.co/datasets/llmsql-bench/benchmark-evaluation-results/blob/main/pllum-12b-nc-chat-250715/5fewshots/pllum-12b-nc-chat-250715_outputs.jsonl