# Debug Test
#
# Workflow file for run "Debug Test #19".

name: "Debug Test"

# Manual trigger only. The pull_request / push triggers below are disabled;
# uncomment them (keeping the indentation) to re-enable automatic runs.
on:
  workflow_dispatch:
  # pull_request:
  # push:
  #   branches: [multinode-firstclass]
jobs:
  # Runs generate_sweep_configs.py (full-sweep, multinode, 1k1k, dsr1 prefix)
  # against the NVIDIA master config and publishes the script's stdout as the
  # job output `multi-node-search-space-config`.
  get-dsr1-configs:
    runs-on: ubuntu-latest
    outputs:
      multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }}
    steps:
      - name: Checkout code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
      - id: get-dsr1-configs
        run: |
          pip install pydantic
          CONFIG_JSON_MULTI_NODE=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py" \
            full-sweep \
            --node-type multinode \
            --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" \
            --seq-lens 1k1k \
            --model-prefix dsr1)
          # Quote every expansion: the value is JSON, and unquoted it would be
          # word-split and glob-expanded by the shell.
          # NOTE(review): the `key=value` output form assumes the script emits
          # single-line JSON - confirm.
          echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> "$GITHUB_OUTPUT"
          echo "$CONFIG_JSON_MULTI_NODE"

  # Disabled: gptoss counterpart of get-dsr1-configs.
  # get-gptoss-configs:
  #   runs-on: ubuntu-latest
  #   outputs:
  #     multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }}
  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
  #     - id: get-gptoss-configs
  #       run: |
  #         pip install pydantic
  #         CONFIG_JSON_MULTI_NODE=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py" full-sweep --node-type multinode --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" "${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml" --seq-lens 1k1k --model-prefix gptoss)
  #         echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> "$GITHUB_OUTPUT"

  # Calls the multi-node benchmark reusable workflow once per entry of the
  # matrix decoded from get-dsr1-configs' JSON output.
  benchmark-dsr1-multi-node:
    needs: get-dsr1-configs
    uses: ./.github/workflows/benchmark-multinode-tmpl.yml
    name: dsr1 1k1k /
    strategy:
      # Let the remaining configurations keep running when one of them fails.
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
    secrets: inherit
    with:
      isl: 1024
      osl: 1024
      max-model-len: 2048
      runner: ${{ matrix.config.runner }}
      image: ${{ matrix.config.image }}
      model: ${{ matrix.config.model }}
      framework: ${{ matrix.config.framework }}
      precision: ${{ matrix.config.precision }}
      exp-name: "dsr1_1k1k"
      conc: ${{ matrix.config.conc }}
      mtp-mode: ${{ matrix.config.mtp }}
      # Passed as a JSON string because the value is a nested object.
      additional-settings: ${{ toJson(matrix.config.additional-settings) }}
      prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
      prefill-tp: ${{ matrix.config.prefill.tp }}
      prefill-ep: ${{ matrix.config.prefill.ep }}
      prefill-batch-size: ${{ matrix.config.prefill.batch-size }}
      prefill-max-num-tokens: ${{ matrix.config.prefill.max-num-tokens }}
      prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
      decode-num-worker: ${{ matrix.config.decode.num-worker }}
      decode-tp: ${{ matrix.config.decode.tp }}
      decode-ep: ${{ matrix.config.decode.ep }}
      decode-batch-size: ${{ matrix.config.decode.batch-size }}
      decode-max-num-tokens: ${{ matrix.config.decode.max-num-tokens }}
      decode-dp-attn: ${{ matrix.config.decode.dp-attn }}

  # Disabled: gptoss counterpart of benchmark-dsr1-multi-node.
  # benchmark-gptoss-multi-node:
  #   needs: get-gptoss-configs
  #   uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  #   name: gptoss 1k1k /
  #   strategy:
  #     fail-fast: false
  #     matrix:
  #       config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
  #   secrets: inherit
  #   with:
  #     exp-name: "gptoss_1k1k"
  #     isl: 1024
  #     osl: 1024
  #     max-model-len: 2048
  #     runner: ${{ matrix.config.runner }}
  #     image: ${{ matrix.config.image }}
  #     model: ${{ matrix.config.model }}
  #     framework: ${{ matrix.config.framework }}
  #     precision: ${{ matrix.config.precision }}
  #     tp: ${{ matrix.config.tp }}
  #     ep: ${{ matrix.config.ep }}
  #     dp-attn: ${{ matrix.config.dp-attn }}
  #     conc: ${{ matrix.config.conc }}

  # Calls the collect-results reusable workflow for the dsr1_1k1k experiment.
  # `always()` makes it run even when some benchmark jobs failed.
  collect-dsr1-results:
    needs: benchmark-dsr1-multi-node
    if: ${{ always() }}
    uses: ./.github/workflows/collect-results.yml
    secrets: inherit
    with:
      exp-name: "dsr1_1k1k"

  # Disabled: gptoss counterpart of collect-dsr1-results.
  # collect-gptoss-results:
  #   needs: benchmark-gptoss-multi-node
  #   if: ${{ always() }}
  #   uses: ./.github/workflows/collect-results.yml
  #   secrets: inherit
  #   with:
  #     exp-name: "gptoss_1k1k"

  # Downloads every `results_*` artifact, runs utils/calc_success_rate.py and
  # uploads the resulting stats file as the `run-stats` artifact. Runs
  # regardless of the outcome of the collection job.
  calc-success-rate:
    needs: [collect-dsr1-results]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    env:
      RESULTS_DIR: "results/"
      STATS_FILENAME: "run_stats"
      # Exposes the REPO_PAT secret to every step under the GITHUB_TOKEN name.
      # NOTE(review): presumably read by calc_success_rate.py via PyGithub - confirm.
      GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          # 0 = fetch the full history instead of a shallow clone.
          fetch-depth: 0
      - name: Download results artifacts
        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53  # v6.0.0
        with:
          path: ${{ env.RESULTS_DIR }}
          pattern: results_*
      - name: Install python dependencies
        run: pip install PyGithub
      - name: Calculate success rate
        run: python3 utils/calc_success_rate.py "$STATS_FILENAME"
      # NOTE(review): assumes the script writes "<STATS_FILENAME>.json" in the
      # working directory - confirm against calc_success_rate.py.
      - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
        with:
          name: "run-stats"
          path: ${{ env.STATS_FILENAME }}.json