Skip to content

Commit 9f6aa7b

Browse files
committed
added model quality gha
1 parent e15d855 commit 9f6aa7b

File tree

4 files changed

+475
-0
lines changed

4 files changed

+475
-0
lines changed
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Manually dispatched workflow that runs the streaming compliance benchmark.
name: Streaming Compliance Benchmark

on:
  workflow_dispatch:
    # Only `model` is required. Every other input may be left empty, in which
    # case the benchmark step does not forward a flag for it to pytest.
    inputs:
      # --- Completion parameters -------------------------------------------
      model:
        description: 'Model id'
        required: true
        default: 'fireworks_ai/accounts/fireworks/models/glm-4p6'
      max_tokens:
        description: 'Override max_tokens (integer)'
        required: false
        default: ''
      reasoning_effort:
        description: 'Reasoning effort (low|medium|high|none)'
        required: false
        default: ''
      # --- Dataset size ----------------------------------------------------
      max_rows:
        # Double-quoted because the text itself contains single quotes.
        description: "Max rows for smoke vs full run (integer or 'all')"
        required: false
        default: ''
      temperature:
        description: 'Temperature (float)'
        required: false
        default: ''
      stream:
        description: 'Enable streaming (true or empty)'
        required: false
        # Kept as a quoted string so it is forwarded verbatim, not as a boolean.
        default: 'true'
      # --- Execution controls ----------------------------------------------
      max_concurrency:
        description: 'Max concurrency (integer)'
        required: false
        default: ''
      num_runs:
        description: 'Number of runs (integer)'
        required: false
        default: ''
      max_retry:
        description: 'Max retry (integer)'
        required: false
        default: ''
      success_threshold:
        description: 'Minimum test score needed to pass (float)'
        required: false
        default: ''
46+
47+
jobs:
  # Installs the package into a uv-managed virtualenv, runs the streaming
  # compliance benchmark through pytest, and uploads the JSON summary.
  streaming-compliance:
    runs-on: 8-core-32gb-ubuntu
    timeout-minutes: 180

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: "3.11"

      - name: Setup uv and .venv
        run: |
          python -m pip install --upgrade pip
          pip install uv
          uv venv
          . .venv/bin/activate
          uv pip install --upgrade pip

      - name: Install python-sdk package
        run: |
          . .venv/bin/activate
          uv pip install .

      - name: Run streaming compliance benchmark (pytest)
        # The script below uses bash arrays, so pin the shell explicitly.
        shell: bash
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ vars.FIREWORKS_ACCOUNT_ID }}
          # Dispatch inputs are handed over as environment variables instead of
          # being expanded into the script text. A value containing quotes,
          # `$(...)` or backticks can therefore no longer alter the script.
          INPUT_MODEL: ${{ github.event.inputs.model }}
          INPUT_MAX_TOKENS: ${{ github.event.inputs.max_tokens }}
          INPUT_REASONING_EFFORT: ${{ github.event.inputs.reasoning_effort }}
          INPUT_MAX_ROWS: ${{ github.event.inputs.max_rows }}
          INPUT_TEMPERATURE: ${{ github.event.inputs.temperature }}
          INPUT_STREAM: ${{ github.event.inputs.stream }}
          INPUT_NUM_RUNS: ${{ github.event.inputs.num_runs }}
          INPUT_MAX_CONCURRENCY: ${{ github.event.inputs.max_concurrency }}
          INPUT_MAX_RETRY: ${{ github.event.inputs.max_retry }}
          INPUT_SUCCESS_THRESHOLD: ${{ github.event.inputs.success_threshold }}
        run: |
          . .venv/bin/activate
          mkdir -p artifacts

          MODEL="$INPUT_MODEL"
          MAX_TOKENS="$INPUT_MAX_TOKENS"
          REASONING="$INPUT_REASONING_EFFORT"
          MAX_ROWS="$INPUT_MAX_ROWS"
          TEMPERATURE="$INPUT_TEMPERATURE"
          STREAM="$INPUT_STREAM"
          NUM_RUNS="$INPUT_NUM_RUNS"
          MAX_CONC="$INPUT_MAX_CONCURRENCY"
          MAX_RETRY="$INPUT_MAX_RETRY"
          SUCCESS_THRESHOLD="$INPUT_SUCCESS_THRESHOLD"

          echo "Running streaming compliance with reasoning_effort=${REASONING:-<default>} max_rows=${MAX_ROWS:-<default>} model=${MODEL:-<default>} max_tokens=${MAX_TOKENS:-<default>} temperature=${TEMPERATURE:-<default>} stream=${STREAM:-<default>} num_runs=${NUM_RUNS:-<default>} max_concurrency=${MAX_CONC:-<default>} max_retry=${MAX_RETRY:-<default>} success_threshold=${SUCCESS_THRESHOLD:-<default>}"

          PYTEST_TARGET=eval_protocol.benchmarks.test_glm_streaming_compliance
          # Arguments are collected in an array so each value stays a single
          # argv entry even if it contains whitespace or glob characters.
          PYTEST_ARGS=(--pyargs "$PYTEST_TARGET" -q -s --ep-print-summary --ep-summary-json artifacts/streaming_compliance.json)

          # Append "<flag>=<value>" only when <value> is non-empty.
          add_option() {
            if [ -n "$2" ]; then
              PYTEST_ARGS+=("$1=$2")
            fi
          }

          # Append "--ep-input-param <key>=<value>" only when <value> is non-empty.
          add_input_param() {
            if [ -n "$2" ]; then
              PYTEST_ARGS+=(--ep-input-param "$1=$2")
            fi
          }

          # Same flag order as before, so the resulting command line is
          # unchanged for well-formed inputs.
          add_option --ep-max-rows "$MAX_ROWS"
          add_option --ep-reasoning-effort "$REASONING"
          add_input_param model "$MODEL"
          add_input_param max_tokens "$MAX_TOKENS"
          add_input_param temperature "$TEMPERATURE"
          add_input_param stream "$STREAM"
          add_option --ep-num-runs "$NUM_RUNS"
          add_option --ep-max-concurrent-rollouts "$MAX_CONC"
          add_option --ep-max-retry "$MAX_RETRY"
          add_option --ep-success-threshold "$SUCCESS_THRESHOLD"

          echo "Running: pytest ${PYTEST_ARGS[*]}"
          pytest "${PYTEST_ARGS[@]}"

      - name: Upload JSON artifact(s)
        # Upload whatever summary exists even when the benchmark step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: streaming_compliance_json
          path: artifacts/*.json
          if-no-files-found: warn
          retention-days: 14

0 commit comments

Comments
 (0)