Skip to content

Commit f71349a

Browse files
author
jetstream authors
committed
Merge pull request #254 from AI-Hypercomputer:yuyan-stable-stack
PiperOrigin-RevId: 750626566
2 parents 9cb7785 + a789475 commit f71349a

9 files changed

Lines changed: 413 additions & 0 deletions

File tree

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# Stage: fetch the MaxText sources into /src/maxtext.
# MAXTEXT_COMMIT_HASH is optional. When it is empty, the shallow clone of the
# default branch head is used as-is; otherwise that single commit is fetched
# (also shallow, to match the clone) and checked out in detached-HEAD state.
FROM alpine/git:2.47.2 AS maxtext_cloner

ARG MAXTEXT_COMMIT_HASH

WORKDIR /src

RUN \
  git clone --depth=1 https://github.com/AI-Hypercomputer/maxtext.git && \
  if [ -n "${MAXTEXT_COMMIT_HASH}" ]; then \
    cd maxtext && \
    git fetch --depth=1 origin "${MAXTEXT_COMMIT_HASH}" && \
    git switch --detach "${MAXTEXT_COMMIT_HASH}"; \
  fi
29+
30+
31+
# Stage: fetch the JetStream sources into /src/JetStream.
# JETSTREAM_COMMIT_HASH is optional. When it is empty, the shallow clone of the
# default branch head is used as-is; otherwise that single commit is fetched
# (also shallow, to match the clone) and checked out in detached-HEAD state.
FROM alpine/git:2.47.2 AS jetstream_cloner

ARG JETSTREAM_COMMIT_HASH

WORKDIR /src

RUN \
  git clone --depth=1 https://github.com/AI-Hypercomputer/JetStream.git && \
  if [ -n "${JETSTREAM_COMMIT_HASH}" ]; then \
    cd JetStream && \
    git fetch --depth=1 origin "${JETSTREAM_COMMIT_HASH}" && \
    git switch --detach "${JETSTREAM_COMMIT_HASH}"; \
  fi
43+
44+
# Final stage: Python 3.10 image with MaxText and JetStream installed from the
# checkouts produced by the two cloner stages.
FROM python:3.10-slim-bullseye AS runner

WORKDIR /jetstream_maxtext_stable_stack

# pip: disable the download cache and silence the running-as-root warning.
ENV PIP_NO_CACHE_DIR=1 \
    PIP_ROOT_USER_ACTION=ignore

# Version markers for Python and the Google Cloud SDK.
ENV PYTHON_VERSION=3.10 \
    CLOUD_SDK_VERSION=latest

# Keep apt/dpkg from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
    apt-get install -y --no-install-recommends git git-lfs && \
    rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --upgrade pip

# Copy the MaxText checkout and run its own setup script.
COPY --from=maxtext_cloner /src .
RUN cd maxtext && bash setup.sh

# MaxText's setup installs JetStream from main; install the cloned JetStream
# checkout afterwards so it overrides that version, then add the benchmark
# requirements.
COPY --from=jetstream_cloner /src .
RUN python3 -m pip install ./JetStream
RUN python3 -m pip install -r ./JetStream/benchmarks/requirements.in

# Write a manifest of the installed packages, stamped with the commit that each
# checkout actually resolved to.
COPY generate_manifest.sh .
RUN bash ./generate_manifest.sh \
      PREFIX=jetstream_maxtext \
      MAXTEXT_COMMIT_HASH=$(git -C ./maxtext rev-parse HEAD) \
      JETSTREAM_COMMIT_HASH=$(git -C ./JetStream rev-parse HEAD)
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Jetstream MaxText Stable Stack
2+
3+
This provides a stable Docker image stack for running MaxText using JetStream on Cloud TPUs for inference.
4+
5+
## Overview
6+
7+
The goal of this project is to offer a reliable and up-to-date environment for deploying and serving MaxText efficiently on TPU hardware via the JetStream inference server.
8+
9+
## Getting Started
10+
11+
### Prerequisites
12+
13+
- Docker installed on your machine or VM.
14+
- Access to Google Cloud Platform and authenticated `gcloud` CLI (if pulling from GCR).
15+
- Access to TPU resources configured for your project.
16+
17+
### Pulling the Image
18+
19+
The stable stack is published as a nightly Docker image on Google Container Registry (GCR). To pull the image built on a specific date, set `NIGHTLY_DATE` to that date in `YYYYMMDD` format (e.g., `20231027`); to pull the most recent build, use the plain `nightly` tag:
20+
21+
```bash
22+
# Replace YYYYMMDD with the specific date, e.g., 20231027
23+
export NIGHTLY_DATE=$(date +"%Y%m%d") # Or set manually, e.g., export NIGHTLY_DATE=20231027
24+
25+
docker pull gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:nightly-${NIGHTLY_DATE}
26+
27+
# Or the last nightly build
28+
docker pull gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:nightly
29+
```
30+
31+
## Running the Container
32+
33+
Run on the TPU VM.
34+
35+
```bash
36+
# Add any necessary volume mounts, TPU device access, network ports, etc. as extra `docker run` flags.
37+
docker run --net=host --privileged --rm -it \
38+
gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:nightly \
39+
bash
40+
```
41+
42+
## Image Information
43+
44+
- Registry: Google Container Registry (GCR)
45+
- Path: gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu
46+
- Tagging Scheme: nightly-YYYYMMDD (e.g., nightly-20231027)
47+
48+
A new image is built nightly, incorporating the latest updates and dependencies for the JetStream-MaxText stack on TPUs. Use the tag corresponding to the date you wish to use.
49+
50+
## Build the Image
51+
52+
- `build.sh` builds the local Docker image.
53+
- `test.sh` runs every `.sh` script in `test_script/` against the built image.
54+
- `pipeline.sh` builds, tests, and uploads the image if every step succeeds.
55+
56+
```bash
57+
./pipeline.sh UPLOAD_IMAGE_TAG=gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:nightly-$(date +"%Y%m%d")
58+
```
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/bash
2+
# Copyright 2025 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Build the JetStream + MaxText stable stack image from ./Dockerfile.
#
# Usage: ./build.sh [KEY=VALUE ...]
# Settings are read from the environment and may be overridden by KEY=VALUE
# arguments:
#   LOCAL_IMAGE_TAG        tag for the built image
#                          (default: jetstream-maxtext-stable-stack:latest)
#   MAXTEXT_COMMIT_HASH    MaxText commit to build; empty uses the cloned head
#   JETSTREAM_COMMIT_HASH  JetStream commit to build; empty uses the cloned head
set -xe

export LOCAL_IMAGE_TAG="${LOCAL_IMAGE_TAG:-jetstream-maxtext-stable-stack:latest}"
export MAXTEXT_COMMIT_HASH="${MAXTEXT_COMMIT_HASH:-}"
export JETSTREAM_COMMIT_HASH="${JETSTREAM_COMMIT_HASH:-}"

# Apply KEY=VALUE command-line overrides; everything after the first '=' is
# taken as the value.
for ARGUMENT in "$@"; do
  IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
  export "$KEY"="$VALUE"
done

# An override such as LOCAL_IMAGE_TAG= can still blank the tag, so re-check.
if [[ -z "$LOCAL_IMAGE_TAG" ]]; then
  echo -e "\n\nError: You must specify an LOCAL_IMAGE_TAG.\n\n" >&2
  exit 1
fi

# --no-cache forces a fresh clone of both repositories on every build.
docker build --no-cache \
  --build-arg MAXTEXT_COMMIT_HASH="${MAXTEXT_COMMIT_HASH}" \
  --build-arg JETSTREAM_COMMIT_HASH="${JETSTREAM_COMMIT_HASH}" \
  -t "${LOCAL_IMAGE_TAG}" \
  -f ./Dockerfile .

echo "********* Successfully built Stable Stack Image with tag $LOCAL_IMAGE_TAG *********"
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash
2+
# Copyright 2025 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# This script generates a manifest of currently installed Python packages, along with their versions.
17+
# The manifest is named with a timestamp for easy versioning and tracking.
18+
19+
# PREFIX names the manifest file; it and the optional MAXTEXT_COMMIT_HASH /
# JETSTREAM_COMMIT_HASH values are supplied as KEY=VALUE arguments.
export PREFIX='default'

for ARGUMENT in "$@"; do
  IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
  export "$KEY"="$VALUE"
  echo "$KEY"="$VALUE"
done

# Set the Manifest file name with the date for versioning
TIMESTAMP=$(date +"%Y%m%d-%H%M%S")
MANIFEST_FILE="${PREFIX}_manifest_${TIMESTAMP}.txt"

# Write the manifest in a single pass: header comments first, then the sorted
# package list. This avoids rewriting the file through a fixed scratch file in
# the working directory.
{
  echo "# Python Packages Frozen at: ${TIMESTAMP}"
  if [[ -n "$JETSTREAM_COMMIT_HASH" ]]; then
    echo "# JetStream commit hash: $JETSTREAM_COMMIT_HASH"
  fi
  if [[ -n "$MAXTEXT_COMMIT_HASH" ]]; then
    echo "# maxtext commit hash: $MAXTEXT_COMMIT_HASH"
  fi
  # MaxText depends on the main branch of JetStream, which is not the version
  # this image ships, so google-jetstream is left out of the manifest.
  # grep exits 1 when it selects no lines; that is not an error here.
  pip freeze | sort | grep -vE '^google-jetstream(==|>=|<=|>|<| |@|$)' || true
} > "$MANIFEST_FILE"
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
2+
# Copyright 2025 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Trace every command and stop at the first failure.
set -xe

# Pipeline defaults. These deliberately replace any values inherited from the
# environment; only KEY=VALUE arguments can change them.
LOCAL_IMAGE_TAG="jetstream-maxtext-stable-stack:nightly"
MAXTEXT_COMMIT_HASH=""
JETSTREAM_COMMIT_HASH=""
UPLOAD_IMAGE_TAG=""
export LOCAL_IMAGE_TAG MAXTEXT_COMMIT_HASH JETSTREAM_COMMIT_HASH UPLOAD_IMAGE_TAG

# Export each KEY=VALUE argument so build.sh and test.sh inherit it.
for kv_arg in "$@"; do
  IFS='=' read -r key value <<< "$kv_arg"
  export "${key}=${value}"
done

# The upload destination has no default, so refuse to continue without one.
[[ -n "$UPLOAD_IMAGE_TAG" ]] || {
  echo -e "\n\nError: You must specify an UPLOAD_IMAGE_TAG.\n\n"
  exit 1
}
33+
34+
35+
#######################################
# Tag the locally built image for upload and push it.
# The image is pushed twice: once under UPLOAD_IMAGE_TAG and once under the
# same repository with the moving `nightly` tag.
# Globals:   LOCAL_IMAGE_TAG (read), UPLOAD_IMAGE_TAG (read)
# Arguments: none
# Outputs:   docker's own output, then a confirmation line on stdout
# Returns:   status of the last command; under `set -e` the first failing
#            docker call aborts the script
#######################################
docker_image_upload()
{
  # Strip the tag only when the last path component carries one, so that a
  # registry port (e.g. localhost:5000/image) is never mistaken for a tag.
  local repository="${UPLOAD_IMAGE_TAG}"
  if [[ "${UPLOAD_IMAGE_TAG##*/}" == *:* ]]; then
    repository="${UPLOAD_IMAGE_TAG%:*}"
  fi
  local nightly_tag="${repository}:nightly"

  docker tag "${LOCAL_IMAGE_TAG}" "${UPLOAD_IMAGE_TAG}"
  docker tag "${LOCAL_IMAGE_TAG}" "${nightly_tag}"
  docker push "${UPLOAD_IMAGE_TAG}"
  docker push "${nightly_tag}"
  echo "All done, check out your artifacts at: ${UPLOAD_IMAGE_TAG}"
}
44+
45+
# Configure Docker to authenticate through gcloud before any push.
# NOTE(review): only us-docker.pkg.dev is configured here, while the README's
# example UPLOAD_IMAGE_TAG points at gcr.io — confirm the push registry is
# covered by this credential-helper setup.
gcloud auth configure-docker us-docker.pkg.dev --quiet
46+
# Build the image, run the test scripts against it, then tag and push.
# The `set -xe` at the top of this script stops the sequence at the first step
# that exits non-zero, so nothing is uploaded after a failed build or test run.
./build.sh
47+
./test.sh
48+
docker_image_upload
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
2+
# Copyright 2025 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Docker image name to use for executing test scripts
17+
# Falls back to build.sh's default tag so that running this script on its own
# never reaches `docker run` with an empty image name.
export LOCAL_IMAGE_TAG="${LOCAL_IMAGE_TAG:-jetstream-maxtext-stable-stack:latest}"

# Set environment variables
for ARGUMENT in "$@"; do
  IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
  export "$KEY"="$VALUE"
done

# A LOCAL_IMAGE_TAG= argument can still blank the tag. Without this check the
# next word on the docker command line (bash) would be taken as the image.
if [[ -z "$LOCAL_IMAGE_TAG" ]]; then
  echo "Error: You must specify a LOCAL_IMAGE_TAG." >&2
  exit 1
fi

echo "--- Starting test execution ---"

# nullglob makes the array empty, rather than holding the literal pattern,
# when no script matches. The pattern is relative to the current directory.
shopt -s nullglob
test_script_files=(test_script/*.sh)
shopt -u nullglob

if (( ${#test_script_files[@]} == 0 )); then
  echo "Warning: no test scripts found under test_script/" >&2
else
  echo "Found the following test scripts:"
  printf " - %s\n" "${test_script_files[@]}"
fi

failed_scripts=()
overall_exit_status=0

for script_path in "${test_script_files[@]}"; do
  if [[ -f "$script_path" ]]; then
    echo ">>> Running test script: $script_path"

    # The script is fed to bash inside the container on stdin, so it does not
    # need to exist in the image.
    docker run --net=host --privileged --rm -i "${LOCAL_IMAGE_TAG}" bash < "$script_path"
    script_exit_status=$? # Capture the exit code of the docker run command

    if [[ $script_exit_status -ne 0 ]]; then
      echo "<<< FAILED test script: $script_path (Exit Code: $script_exit_status)"
      failed_scripts+=("$script_path")
      overall_exit_status=1
    else
      echo "<<< Finished test script successfully: $script_path"
    fi
    echo
  else
    echo "--- Skipping non-file entry: $script_path ---"
  fi
done

echo

# Every script is run even after a failure; the summary decides the exit code.
if [[ $overall_exit_status -ne 0 ]]; then
  echo "--- Test Execution Summary: FAILURES DETECTED ---"
  echo "The following scripts failed:"
  printf " - %s\n" "${failed_scripts[@]}"
  exit 1
else
  echo "--- Test Execution Summary: All tests passed successfully ---"
  exit 0
fi
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Runs the MaxText chunked-prefill benchmark for mixtral-8x7b with int8
# quantization. Expects a `maxtext` checkout in the current directory
# (presumably the image's working directory — confirm against the Dockerfile).
# Stop immediately if the checkout is missing instead of running the benchmark
# from the wrong directory.
cd maxtext || exit 1

LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" \
python -m MaxText.benchmark_chunked_prefill \
  MaxText/configs/inference.yml \
  tokenizer_path=assets/tokenizer.mistral-v1 \
  max_prefill_predict_length=8192 \
  max_target_length=8704 \
  model_name=mixtral-8x7b \
  ici_fsdp_parallelism=1 \
  ici_autoregressive_parallelism=1 \
  ici_tensor_parallelism=8 \
  scan_layers=false \
  weight_dtype=bfloat16 \
  per_device_batch_size=8 \
  megablox=False \
  quantization=int8 \
  quantize_kvcache=False \
  checkpoint_is_quantized=True \
  capacity_factor=1 \
  attention=dot_product \
  model_call_mode=inference \
  sparse_matmul=False \
  use_chunked_prefill=true \
  prefill_chunk_size=2048
26+

0 commit comments

Comments
 (0)