---
# Manually-triggered benchmark that runs the streaming-compliance pytest
# suite against a Fireworks-hosted model and uploads the JSON summary.
name: Streaming Compliance Benchmark

on:
  workflow_dispatch:
    inputs:
      model:
        description: "Model id"
        required: true
        default: "fireworks_ai/accounts/fireworks/models/glm-4p6"
      max_tokens:
        description: "Override max_tokens (integer)"
        required: false
        default: ""
      reasoning_effort:
        description: "Reasoning effort (low|medium|high|none)"
        required: false
        default: ""
      max_rows:
        description: "Max rows for smoke vs full run (integer or 'all')"
        required: false
        default: ""
      temperature:
        description: "Temperature (float)"
        required: false
        default: ""
      stream:
        description: "Enable streaming (true or empty)"
        required: false
        default: "true"
      max_concurrency:
        description: "Max concurrency (integer)"
        required: false
        default: ""
      num_runs:
        description: "Number of runs (integer)"
        required: false
        default: ""
      max_retry:
        description: "Max retry (integer)"
        required: false
        default: ""
      success_threshold:
        description: "Minimum test score needed to pass (float)"
        required: false
        default: ""

jobs:
  streaming-compliance:
    runs-on: 8-core-32gb-ubuntu
    timeout-minutes: 180

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Setup uv and .venv
        run: |
          python -m pip install --upgrade pip
          pip install uv
          uv venv
          . .venv/bin/activate
          uv pip install --upgrade pip

      - name: Install python-sdk package
        run: |
          . .venv/bin/activate
          uv pip install .

      - name: Run streaming compliance benchmark (pytest)
        # All workflow_dispatch inputs are passed through `env:` rather than
        # interpolated directly into the shell script, so an input containing
        # shell metacharacters cannot inject commands (GitHub Actions
        # script-injection hardening).
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ vars.FIREWORKS_ACCOUNT_ID }}
          MODEL: ${{ github.event.inputs.model }}
          MAX_TOKENS: ${{ github.event.inputs.max_tokens }}
          REASONING: ${{ github.event.inputs.reasoning_effort }}
          MAX_ROWS: ${{ github.event.inputs.max_rows }}
          TEMPERATURE: ${{ github.event.inputs.temperature }}
          STREAM: ${{ github.event.inputs.stream }}
          NUM_RUNS: ${{ github.event.inputs.num_runs }}
          MAX_CONC: ${{ github.event.inputs.max_concurrency }}
          MAX_RETRY: ${{ github.event.inputs.max_retry }}
          SUCCESS_THRESHOLD: ${{ github.event.inputs.success_threshold }}
        run: |
          . .venv/bin/activate
          mkdir -p artifacts

          echo "Running streaming compliance with reasoning_effort=${REASONING:-<default>} max_rows=${MAX_ROWS:-<default>} model=${MODEL:-<default>} max_tokens=${MAX_TOKENS:-<default>} temperature=${TEMPERATURE:-<default>} stream=${STREAM:-<default>} num_runs=${NUM_RUNS:-<default>} max_concurrency=${MAX_CONC:-<default>} max_retry=${MAX_RETRY:-<default>} success_threshold=${SUCCESS_THRESHOLD:-<default>}"

          # Build the pytest invocation; every empty input is simply omitted
          # so the benchmark's own defaults apply.
          PYTEST_TARGET=eval_protocol.benchmarks.test_glm_streaming_compliance
          PYTEST_ARGS="--pyargs $PYTEST_TARGET -q -s --ep-print-summary --ep-summary-json artifacts/streaming_compliance.json"
          [ -n "$MAX_ROWS" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-max-rows=$MAX_ROWS"
          [ -n "$REASONING" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-reasoning-effort=$REASONING"
          [ -n "$MODEL" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-input-param model=$MODEL"
          [ -n "$MAX_TOKENS" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-input-param max_tokens=$MAX_TOKENS"
          [ -n "$TEMPERATURE" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-input-param temperature=$TEMPERATURE"
          [ -n "$STREAM" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-input-param stream=$STREAM"
          [ -n "$NUM_RUNS" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-num-runs=$NUM_RUNS"
          [ -n "$MAX_CONC" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-max-concurrent-rollouts=$MAX_CONC"
          [ -n "$MAX_RETRY" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-max-retry=$MAX_RETRY"
          [ -n "$SUCCESS_THRESHOLD" ] && PYTEST_ARGS="$PYTEST_ARGS --ep-success-threshold=$SUCCESS_THRESHOLD"
          echo "Running: pytest $PYTEST_ARGS"
          pytest $PYTEST_ARGS

      - name: Upload JSON artifact(s)
        # Upload even when the benchmark step failed, so partial results
        # remain inspectable.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: streaming_compliance_json
          path: artifacts/*.json
          if-no-files-found: warn
          retention-days: 14