Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
43654ef
[diffusion] CI: improve diffusion comparison benchmark setting for re…
mickqian Apr 4, 2026
164bc0a
[Fix] Fix nightly tests (#22140)
Fridge003 Apr 5, 2026
07f57fc
Enable IndexCache for DeepSeek V3.2 (#21405)
jinyouzhi Apr 5, 2026
c1927e1
fix: TRT-LLM MHA CUDA illegal address with EAGLE v2 + DP attention (#…
Kangyan-Zhou Apr 5, 2026
1519acf
[Hotfix] Fix router gemm on sm103 (#22134)
Fridge003 Apr 5, 2026
74c9eab
[1/8] [sglang-miles] True on-policy training support for FSDP2 (#18639)
yueming-yuan Apr 6, 2026
453eb15
[2/8] [sglang-miles] R3 (Rollout Routing Replay) DeepEP and MTP suppo…
yueming-yuan Apr 6, 2026
2bc3f68
[3/8] [sglang-miles] Support INT4 QAT for RL (#18565)
yueming-yuan Apr 6, 2026
277147c
[4/8] [sglang-miles] PD disaggregation for RL (#18646)
yueming-yuan Apr 6, 2026
7af1f15
[5/8] [sglang-miles] MTP related fix (#18647)
yueming-yuan Apr 6, 2026
24def86
[6/8] [sglang-miles] tmp fix for vlm training: use legacy_load_mm_dat…
yueming-yuan Apr 6, 2026
69018ba
[7/8] [sglang-miles] support better token id return for TITO (#19731)
guapisolo Apr 6, 2026
3606aec
[8/8] [feat] Support cross turn token in after last user message (#20…
guapisolo Apr 6, 2026
98b5440
[sglang-miles] fix weight checker (#21494)
yueming-yuan Apr 6, 2026
629aa25
P2P Weight Update features for miles (#21278)
JD-ETH Apr 6, 2026
d335128
Fix is_multimodal_gen attr not in v0.5.10 ModelConfig
yueming-yuan Apr 6, 2026
0e3e23e
Fix flashinfer fused_moe with topk>8 (#22201)
Qiaolin-Yu Apr 6, 2026
58fc036
Add maintain-deploy workflow for auto-merging PRs into deploy branch
DavidBellamy Apr 12, 2026
c1a5de8
feat(openai): expose completion token IDs in chat completion responses
DavidBellamy May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions .github/workflows/maintain-deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
---
# Keeps the `deploy` branch equal to BASE_BRANCH plus every open PR that
# targets it. The branch is rebuilt from scratch and force-pushed whenever
# the desired state changes, so `deploy` is always a clean union of open work.
name: Maintain deploy branch

on:
  schedule:
    - cron: '*/15 * * * *'
  workflow_dispatch:
    inputs:
      force:
        description: 'Rebuild even if no changes detected'
        type: boolean
        default: false

# Never allow two runs (e.g. a scheduled run racing a manual dispatch) to
# force-push `deploy` at the same time; queue the later one instead.
concurrency:
  group: maintain-deploy
  cancel-in-progress: false

env:
  BASE_BRANCH: llm360-main

jobs:
  check:
    runs-on: ubuntu-latest
    # Read-only job: it only inspects commits and open PRs.
    permissions:
      contents: read
      pull-requests: read
    outputs:
      skip: ${{ steps.fingerprint.outputs.skip }}
      state: ${{ steps.fingerprint.outputs.state }}
      branches: ${{ steps.fingerprint.outputs.branches }}
    steps:
      - name: Compute desired state and compare
        id: fingerprint
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          FORCE: ${{ inputs.force }}
          REPO: ${{ github.repository }}
        run: |
          BASE_SHA=$(gh api "repos/${REPO}/commits/${BASE_BRANCH}" --jq '.sha' 2>/dev/null || echo "unknown")

          # Fetch the open-PR list ONCE and derive both the fingerprint and
          # the branch list from it; two separate API calls could observe
          # different PR sets and leave state/branches inconsistent.
          PRS=$(gh api "repos/${REPO}/pulls?state=open&base=${BASE_BRANCH}&per_page=100")

          PR_STATE=$(echo "$PRS" \
            | jq -r 'sort_by(.head.ref) | map(.head.ref + "=" + .head.sha) | join(",")')
          BRANCHES=$(echo "$PRS" | jq -r '.[].head.ref' | tr '\n' ' ')

          STATE="${BASE_SHA}|${PR_STATE}"
          echo "state=$STATE" >> "$GITHUB_OUTPUT"
          echo "branches=$BRANCHES" >> "$GITHUB_OUTPUT"
          echo "Desired state: $STATE"

          # The previously-built state is stamped into the tip commit message
          # of `deploy` by the rebuild job; extract it for comparison.
          CURRENT=$(gh api "repos/${REPO}/commits/deploy" \
            --jq '.commit.message' 2>/dev/null \
            | grep '^state:' | head -1 | cut -d: -f2- || echo "")

          echo "Current state: $CURRENT"

          if [ "$CURRENT" = "$STATE" ] && [ "$FORCE" != "true" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "::notice::No changes detected, skipping rebuild"
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

  rebuild:
    needs: check
    if: needs.check.outputs.skip != 'true'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      issues: write
    steps:
      - uses: actions/checkout@v4
        with:
          # Full clone of all refs so `origin/<branch>` exists for every
          # PR branch we need to merge below.
          fetch-depth: 0

      - name: Configure git
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

      - name: Build deploy branch
        env:
          PR_BRANCHES: ${{ needs.check.outputs.branches }}
        run: |
          # Start from a fresh copy of the base branch every time.
          git checkout -B deploy "origin/${BASE_BRANCH}"

          MERGED=""
          FAILED=""
          for branch in $PR_BRANCHES; do
            echo "Merging $branch..."
            if git merge "origin/$branch" --no-edit -m "Deploy: merge $branch"; then
              MERGED="$MERGED $branch"
            else
              # A conflicting branch is skipped, not fatal: the rest of the
              # branches still get deployed and the conflict is reported below.
              echo "::error::Merge conflict on $branch, skipping"
              git merge --abort
              FAILED="$FAILED $branch"
            fi
          done

          echo "Successfully merged:${MERGED:-<none>}"
          if [ -n "$FAILED" ]; then
            echo "::warning::Failed to merge (conflicts):$FAILED"
          fi
          echo "FAILED_BRANCHES=$FAILED" >> "$GITHUB_ENV"

      - name: Stamp fingerprint and push
        env:
          STATE: ${{ needs.check.outputs.state }}
        run: |
          # Empty commit whose message carries the fingerprint that the
          # check job reads on the next run.
          git commit --allow-empty -m "state:${STATE}"
          git push origin deploy --force

      - name: Report merge conflicts
        # Runs even if an earlier step failed, so a stale conflict issue is
        # still opened/updated/closed appropriately.
        if: always() && needs.check.outputs.skip != 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          ISSUE_TITLE="Deploy: merge conflict"
          EXISTING=$(gh issue list --label deploy-conflict --state open --json number --jq '.[0].number' 2>/dev/null || echo "")

          if [ -z "$FAILED_BRANCHES" ]; then
            if [ -n "$EXISTING" ]; then
              gh issue close "$EXISTING" --comment "Resolved: all branches merged cleanly."
            fi
            exit 0
          fi

          BODY=$(printf "The following branches failed to merge into deploy:\n\n")
          for b in $FAILED_BRANCHES; do
            BODY=$(printf "%s\n- \`%s\`" "$BODY" "$b")
          done
          BODY=$(printf "%s\n\nThis issue auto-closes when the next build merges all branches cleanly." "$BODY")

          if [ -n "$EXISTING" ]; then
            gh issue edit "$EXISTING" --body "$BODY"
          else
            gh issue create --title "$ISSUE_TITLE" --body "$BODY" --label deploy-conflict
          fi
1 change: 1 addition & 0 deletions .github/workflows/nightly-test-nvidia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ jobs:
if: always()
env:
GH_PAT_FOR_NIGHTLY_CI_DATA: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GH_TOKEN: ${{ github.token }}
run: |
python3 scripts/ci/utils/diffusion/generate_diffusion_dashboard.py \
--results comparison-results.json \
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def get_is_diffusion_model(model_path: str) -> bool:
if is_known_non_diffusers_diffusion_model(model_path):
return True

if _is_registered_diffusion_model(model_path):
return True

try:
if envs.SGLANG_USE_MODELSCOPE.get():
from modelscope import model_file_download
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/runtime/launch_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def launch_server(server_args: ServerArgs, launch_http_server: bool = True):
result_pipes_from_slaves_w.append(w)

# Launch all worker processes
master_port = server_args.master_port or (server_args.master_port + 100)
master_port = server_args.master_port
scheduler_pipe_readers = []
scheduler_pipe_writers = []

Expand Down
38 changes: 14 additions & 24 deletions python/sglang/multimodal_gen/runtime/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,7 @@ class ServerArgs:
)

# Master port for distributed inference
# TODO: do not hard code
master_port: int | None = None
master_port: int = 30005

# http server endpoint config
host: str | None = "127.0.0.1"
Expand Down Expand Up @@ -386,36 +385,27 @@ def _adjust_warmup(self):
"Warmup enabled, the launch time is expected to be longer than usual"
)

@staticmethod
def _require_port(port: int, name: str) -> None:
    """Raise if *port* is occupied (used under ``--strict-ports``)."""
    if is_port_available(port):
        return
    raise RuntimeError(
        f"{name} port {port} is unavailable and --strict-ports is enabled. "
        f"Either use a different port or disable --strict-ports."
    )

def _adjust_network_ports(self):
if self.strict_ports:
# Strict mode: fail if port is unavailable
if not is_port_available(self.port):
raise RuntimeError(
f"Port {self.port} is unavailable and --strict-ports is enabled. "
f"Either use a different port or remove --strict-ports to allow auto-selection."
)
if not is_port_available(self.scheduler_port):
raise RuntimeError(
f"Scheduler port {self.scheduler_port} is unavailable and --strict-ports is enabled. "
f"Either use a different port or remove --strict-ports to allow auto-selection."
)
if self.master_port is not None and not is_port_available(self.master_port):
raise RuntimeError(
f"Master port {self.master_port} is unavailable and --strict-ports is enabled. "
f"Either use a different port or remove --strict-ports to allow auto-selection."
)
self._require_port(self.port, "HTTP")
self._require_port(self.scheduler_port, "Scheduler")
self._require_port(self.master_port, "Master")
else:
self.port = self.settle_port(self.port)
initial_scheduler_port = self.scheduler_port + (
random.randint(0, 100) if self.scheduler_port == 5555 else 0
)
self.scheduler_port = self.settle_port(initial_scheduler_port)
initial_master_port = (
self.master_port
if self.master_port is not None
else (30005 + random.randint(0, 100))
)
self.master_port = self.settle_port(initial_master_port, 37)
self.master_port = self.settle_port(self.master_port, 37)

def _adjust_parallelism(self):
if self.tp_size is None:
Expand Down
22 changes: 13 additions & 9 deletions python/sglang/multimodal_gen/test/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,21 @@
"Qwen-Image",
)


def _discover_unit_tests() -> list[str]:
"""Auto-discover all test_*.py files in the unit/ directory."""
unit_dir = Path(__file__).resolve().parent / "unit"
if not unit_dir.is_dir():
return []
return sorted(
f"../unit/{f.name}" for f in unit_dir.glob("test_*.py") if f.is_file()
)


SUITES = {
# no GPU required; safe to run on any CPU-only runner
"unit": [
"../unit/test_sampling_params.py",
"../unit/test_storage.py",
"../unit/test_lora_format_adapter.py",
"../unit/test_server_args.py",
"../unit/test_input_validation.py",
"../unit/test_resolve_prompts.py",
# add new unit tests here
],
# Auto-discovered from test/unit/test_*.py
"unit": _discover_unit_tests(),
"1-gpu": [
"test_server_a.py",
"test_server_b.py",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ def diffusion_server(case: DiffusionTestCase) -> ServerContext:
if server_args.enable_warmup:
extra_args += " --warmup"

# Strict ports: fail immediately if port is occupied instead of silently
# picking another one (which causes the test client to connect to the wrong server).
extra_args += " --strict-ports"

for arg in server_args.extras:
extra_args += f" {arg}"

Expand Down
15 changes: 8 additions & 7 deletions python/sglang/multimodal_gen/test/server/test_server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,10 @@ def start(self) -> ServerContext:
# Apply custom environment variables
env.update(self.env_vars)

# TODO: unify with run_command
logger.info(f"Running command: {shlex.join(command)}")
cmd_str = shlex.join(command)
# Use print (not logger) so the command always appears in CI output
# regardless of log-level configuration.
print(f"[server-test] Running command: {cmd_str}", flush=True)

process = subprocess.Popen(
command,
Expand Down Expand Up @@ -412,11 +414,10 @@ def _log_pipe(pipe: Any, file: Any) -> None:
log_thread.daemon = True
log_thread.start()

logger.info(
"[server-test] Starting server pid=%s, model=%s, log=%s",
process.pid,
self.model,
stdout_path,
print(
f"[server-test] Starting server pid={process.pid}, "
f"model={self.model}, log={stdout_path}",
flush=True,
)

self._wait_for_ready(process, stdout_path)
Expand Down
10 changes: 10 additions & 0 deletions python/sglang/srt/disaggregation/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,16 @@ def _init_kv_manager(self) -> CommonKVManager:
)
return kv_manager

def release_memory_occupation(self):
    """Drop all queued requests and, when the KV manager supports it,
    deregister its transfer buffers so the memory can be released."""
    self.queue.clear()
    self.retracted_queue.clear()
    # Not every KV manager implements buffer deregistration; skip silently
    # when the hook is absent.
    deregister = getattr(self.kv_manager, "deregister_buffer_to_engine", None)
    if deregister is not None:
        deregister()

def resume_memory_occupation(self):
    """Re-register the KV manager's transfer buffers, if it supports that."""
    # Optional hook: only a subset of KV managers expose registration.
    register = getattr(self.kv_manager, "register_buffer_to_engine", None)
    if register is not None:
        register()

def add(self, req: Req, is_retracted: bool = False) -> None:
"""Add a request to the pending queue."""
if self._check_if_req_exceed_kv_capacity(req):
Expand Down
13 changes: 13 additions & 0 deletions python/sglang/srt/disaggregation/mooncake/conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,19 @@ def send_kvcache_staged(
)
return ret

def deregister_buffer_to_engine(self):
    """Batch-deregister all buffer groups (KV, auxiliary, state/extra pool)
    from the transfer engine, skipping any group that is empty."""
    pointer_groups = (
        self.kv_args.kv_data_ptrs,     # KV data buffers
        self.kv_args.aux_data_ptrs,    # auxiliary data buffers
        self.kv_args.state_data_ptrs,  # state/extra pool data buffers
    )
    for ptrs in pointer_groups:
        if ptrs:
            self.engine.batch_deregister(ptrs)

def _transfer_data(self, mooncake_session_id, transfer_blocks):
if not transfer_blocks:
return 0
Expand Down
9 changes: 9 additions & 0 deletions python/sglang/srt/disaggregation/prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,15 @@ def pop_bootstrapped(
else:
return bootstrapped_reqs, failed_reqs

def release_memory_occupation(self):
    """Drop all queued requests and, when the KV manager supports it,
    deregister its transfer buffers so the memory can be released."""
    self.queue.clear()
    # Optional hook: only a subset of KV managers expose deregistration.
    deregister = getattr(self.kv_manager, "deregister_buffer_to_engine", None)
    if deregister is not None:
        deregister()

def resume_memory_occupation(self):
    """Re-register the KV manager's transfer buffers, if it supports that."""
    # Mirror of release_memory_occupation: the hook is optional.
    register = getattr(self.kv_manager, "register_buffer_to_engine", None)
    if register is not None:
        register()


class SchedulerDisaggregationPrefillMixin:
"""
Expand Down
Loading
Loading